Code example #1
    def reload(self, bert_model, gpu):
        from pytorch_transformers import XLNetTokenizer, XLNetModel
        if bert_model.endswith('.tar.gz'):
            self.tokenizer = NoPickle(
                XLNetTokenizer.from_pretrained(bert_model.replace(
                    '.tar.gz', '-vocab.txt'),
                                               do_lower_case=self.lower))
        else:
            self.tokenizer = NoPickle(
                XLNetTokenizer.from_pretrained(bert_model,
                                               do_lower_case=self.lower))

        self.xlnet = NoPickle(XLNetModel.from_pretrained(bert_model))
        if gpu:
            self.xlnet = self.xlnet.cuda()
        self.output_dim = self.xlnet.d_model
        # self.max_len = self.xlnet.embeddings.position_embeddings.num_embeddings

        for p in self.xlnet.parameters():
            p.requires_grad = False

        if self.finetune_tune_last_n > 0:
            # XLNetModel keeps its transformer blocks in `self.layer`;
            # there is no BERT-style `encoder.layer` attribute.
            self.finetune_layers = self.xlnet.layer[-self.finetune_tune_last_n:]
            for p in self.finetune_layers.parameters():
                p.requires_grad = True
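
The freeze-then-unfreeze pattern above also works outside this class. A minimal standalone sketch, assuming pytorch_transformers is installed and using xlnet-base-cased as a stand-in model name:

from pytorch_transformers import XLNetModel

xlnet = XLNetModel.from_pretrained('xlnet-base-cased')
# Freeze everything, then re-enable gradients for the top two blocks only.
for p in xlnet.parameters():
    p.requires_grad = False
for p in xlnet.layer[-2:].parameters():
    p.requires_grad = True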
Code example #2
import torch
from keras.preprocessing.sequence import pad_sequences
from pytorch_transformers import XLNetTokenizer
from torch.utils.data import DataLoader, SequentialSampler, TensorDataset


def get_dataloader(myPath, max_len=128, batch_size=50):
    ## load data (get_data is a project-local helper, not shown here)
    train_revs = get_data(myPath)

    ## tokenize inputs
    tokenizer = XLNetTokenizer.from_pretrained('xlnet-base-cased',
                                               do_lower_case=False)
    tokenized_texts = [tokenizer.tokenize(rev) for rev in train_revs]
    input_ids = [tokenizer.convert_tokens_to_ids(x) for x in tokenized_texts]
    input_ids = pad_sequences(input_ids,
                              maxlen=max_len,
                              dtype="long",
                              truncating="post",
                              padding="post")
    print('tokenized inputs')

    # Create a mask of 1s for each token followed by 0s for padding
    attention_masks = []
    for seq in input_ids:
        seq_mask = [float(i > 0) for i in seq]
        attention_masks.append(seq_mask)
    prediction_inputs = torch.tensor(input_ids, dtype=torch.long)
    prediction_masks = torch.tensor(attention_masks)
    prediction_labels = torch.zeros([len(train_revs)], dtype=torch.long)
    prediction_labels[:len(train_revs) // 2] = 1
    print("loaded tensors")

    prediction_data = TensorDataset(prediction_inputs, prediction_masks,
                                    prediction_labels)
    prediction_sampler = SequentialSampler(prediction_data)
    prediction_dataloader = DataLoader(prediction_data,
                                       sampler=prediction_sampler,
                                       batch_size=batch_size)
    return prediction_dataloader
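
A usage sketch for the helper above (the path and the get_data helper are project-specific assumptions):

dataloader = get_dataloader('data/reviews.txt', max_len=128, batch_size=50)
for input_ids, masks, labels in dataloader:
    print(input_ids.shape, masks.shape, labels.shape)
    break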
Code example #3
    def __init__(
        self,
        pretrained_model_name_or_path: str = "xlnet-large-cased",
        layers: str = "1",
        pooling_operation: str = "first_last",
        use_scalar_mix: bool = False,
    ):
        """XLNet embeddings, as proposed in Yang et al., 2019.
        :param pretrained_model_name_or_path: name or path of XLNet model
        :param layers: comma-separated list of layers
        :param pooling_operation: defines pooling operation for subwords
        :param use_scalar_mix: defines the usage of scalar mix for specified layer(s)
        """
        super().__init__()

        self.tokenizer = XLNetTokenizer.from_pretrained(pretrained_model_name_or_path)
        self.model = XLNetModel.from_pretrained(
            pretrained_model_name_or_path=pretrained_model_name_or_path, output_hidden_states=True
        )
        self.name = pretrained_model_name_or_path
        self.layers: List[int] = [int(layer) for layer in layers.split(",")]
        self.pooling_operation = pooling_operation
        self.use_scalar_mix = use_scalar_mix
        self.static_embeddings = True

        dummy_sentence: Sentence = Sentence()
        dummy_sentence.add_token(Token("hello"))
        embedded_dummy = self.embed(dummy_sentence)
        self.__embedding_length: int = len(
            embedded_dummy[0].get_token(1).get_embedding()
        )
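
If this constructor is the one from flair's XLNetEmbeddings class, usage would look roughly like this sketch (assumes flair is installed):

from flair.data import Sentence
from flair.embeddings import XLNetEmbeddings

embeddings = XLNetEmbeddings('xlnet-large-cased', layers='1')
sentence = Sentence('Hello world')
embeddings.embed(sentence)
for token in sentence:
    print(token.text, token.get_embedding().shape)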
Code example #4
def xlnet_feature_extractor(examples):
    config = XLNetConfig.from_pretrained(model_name)
    tokenizer = XLNetTokenizer.from_pretrained(model_name)
    # Note: built from config alone, so weights are randomly initialized
    # unless they are loaded elsewhere.
    model = XLNetForFeatureExtraction(config)
    # input_ids = torch.tensor(tokenizer.encode(utterance_list[0])).unsqueeze(0)# Batch size 1

    features = convert_examples_to_features(
        examples,
        MAX_SEQ_LEN,
        tokenizer,
        cls_token_at_end=True,  # XLNet has the CLS token at the end
        cls_token=tokenizer.cls_token,
        sep_token=tokenizer.sep_token,
        cls_token_segment_id=2,  # XLNet's segment id for CLS
        pad_on_left=True,  # pad on the left for XLNet
        pad_token_segment_id=4)  # XLNet's segment id for padding

    input_ids_list = []
    input_mask_list = []
    segment_ids_list = []
    for feature in features:
        input_ids_list.append(feature.input_ids)
        input_mask_list.append(feature.input_mask)
        segment_ids_list.append(feature.segment_ids)

    input_ids_tensor = torch.tensor(input_ids_list)
    input_mask_tensor = torch.tensor(input_mask_list)
    segment_ids_tensor = torch.tensor(segment_ids_list)

    transformer_outputs = model(input_ids=input_ids_tensor,
                                attention_mask=input_mask_tensor,
                                token_type_ids=segment_ids_tensor)
    # Assumes the model returns a hidden-state tensor (batch, seq_len, hidden);
    # keep the last position, where XLNet places its CLS token.
    feature = transformer_outputs[:, -1]
    return feature
Code example #5
def extractVocab(model_path=post_rec.XLNetBaseCased):
    tokenizer = XLNetTokenizer.from_pretrained(model_path)
    tokenizer = Seq2SeqAdapterTokenizer(tokenizer)
    vocab_file = os.path.join(model_path, "vocab.txt")

    tokens = []
    token_id = 0
    while token_id < 32000:
        try:
            word = tokenizer.decode([token_id])
        except Exception:
            print(token_id, "exceeded!")
            break

        print(word, token_id)

        # Write the decoded token, not its id, so vocab.txt maps
        # line number to token string.
        tokens.append(word)

        token_id += 1

    with open(vocab_file, "w", encoding="utf-8") as f:
        tokens = map(lambda w: w + "\n", tokens)
        f.writelines(tokens)
        print("**" * 20)
        print("write vocab.txt")
Code example #6
def get_tokenizer(model_path=None, name="bert"):
    tokenizer = None

    if name == "bert":
        from pytorch_transformers import BertTokenizer
        tokenizer = BertTokenizer.from_pretrained(model_path)
        tokenizer = Seq2SeqAdapterTokenizer(tokenizer)
    if name == "gpt2":
        from pytorch_transformers import GPT2Tokenizer
        tokenizer = GPT2Tokenizer.from_pretrained(model_path)
        tokenizer = Seq2SeqAdapterTokenizer(tokenizer)
    if name == "xlnet":
        from pytorch_transformers import XLNetTokenizer
        tokenizer = XLNetTokenizer.from_pretrained(model_path)
        tokenizer = Seq2SeqAdapterTokenizer(tokenizer)
    if name == "roberta":
        tokenizer = RoBertaTokenizer(model_path)

    if name == "simple":
        tokenizer = SimpleTokenizer()
    if name == "spacy":
        tokenizer = SpacyTokenizer()
    if name == "corenlp":
        tokenizer = CoreNLPTokenizer()

    if tokenizer is None:
        raise RuntimeError("tokenizer:{} is not supported!".format(name))

    return tokenizer
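
A minimal usage sketch of the factory above, assuming the Seq2SeqAdapterTokenizer wrapper forwards tokenize calls to the wrapped tokenizer:

tokenizer = get_tokenizer(model_path='xlnet-base-cased', name='xlnet')
tokens = tokenizer.tokenize('XLNet splits text into subword pieces.')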
Code example #7
def get_tokenizer(tokenizer_name):
    log.info(f"\tLoading Tokenizer {tokenizer_name}")
    if tokenizer_name.startswith("bert-"):
        do_lower_case = tokenizer_name.endswith("uncased")
        tokenizer = BertTokenizer.from_pretrained(tokenizer_name, do_lower_case=do_lower_case)
    elif tokenizer_name.startswith("roberta-"):
        tokenizer = RobertaTokenizer.from_pretrained(tokenizer_name)
    elif tokenizer_name.startswith("xlnet-"):
        do_lower_case = tokenizer_name.endswith("uncased")
        tokenizer = XLNetTokenizer.from_pretrained(tokenizer_name, do_lower_case=do_lower_case)
    elif tokenizer_name.startswith("openai-gpt"):
        tokenizer = OpenAIGPTTokenizer.from_pretrained(tokenizer_name)
    elif tokenizer_name.startswith("gpt2"):
        tokenizer = GPT2Tokenizer.from_pretrained(tokenizer_name)
    elif tokenizer_name.startswith("transfo-xl-"):
        # TransformerXL is trained on data pretokenized with MosesTokenizer
        tokenizer = MosesTokenizer()
    elif tokenizer_name.startswith("xlm-"):
        tokenizer = XLMTokenizer.from_pretrained(tokenizer_name)
    elif tokenizer_name == "MosesTokenizer":
        tokenizer = MosesTokenizer()
    elif tokenizer_name == "SplitChars":
        tokenizer = SplitCharsTokenizer()
    elif tokenizer_name == "":
        tokenizer = SpaceTokenizer()
    else:
        tokenizer = None
    return tokenizer
Code example #8
File: nli_train.py Project: yukioichida/xlnet-nli
def train(args, device):
    args.dataset_name = "MNLI"  # TODO: parametrize

    model_name = args.model_name
    log = get_train_logger(args)
    SEED = 42
    random.seed(SEED)
    np.random.seed(SEED)
    torch.manual_seed(SEED)

    log.info(f'Using device {device}')
    tokenizer = XLNetTokenizer.from_pretrained(model_name, do_lower_case=True)
    xlnet_config = XLNetConfig.from_pretrained(
        model_name,
        output_hidden_states=True,
        output_attentions=True,
        num_labels=3,
        finetuning_task=args.dataset_name)

    model = XLNetForSequenceClassification.from_pretrained(model_name,
                                                           config=xlnet_config)

    model.to(device)

    # Load features from datasets
    data_loader = MNLIDatasetReader(args, tokenizer, log)
    train_file = os.path.join(args.base_path, args.train_file)
    val_file = os.path.join(args.base_path, args.val_file)
    train_dataloader = data_loader.load_train_dataloader(train_file)
    val_dataloader = data_loader.load_val_dataloader(val_file)

    trainer = TrainModel(train_dataloader, val_dataloader, log)
    trainer.train(model, device, args)
Code example #9
def TextPrep(sentimentData):
    sentimentData.Text = sentimentData.Text.apply(nltk.tokenize.sent_tokenize)
    sentimentData.Text = sentimentData.Text.apply(xlnetPrep)
    # Turns the string into a sequence of words
    tokenizer = XLNetTokenizer.from_pretrained('xlnet-base-cased',
                                               do_lower_case=True)
    tokenizedText = sentimentData.Text.apply(tokenizer.tokenize)
    MAX_LEN = 128
    # Note: XLNet's native special tokens are <sep>/<cls>; this script keeps
    # the original BERT-style markers.
    SEPToken = tokenizer.tokenize(' [SEP]')
    CLSToken = tokenizer.tokenize(' [CLS]')
    tokenizedText = tokenizedText.apply(
        lambda x: x[:MAX_LEN - 8] + SEPToken + CLSToken
        if len(x) > MAX_LEN - 4 else x + CLSToken)
    # Create token IDs
    input_ids = tokenizedText.apply(tokenizer.convert_tokens_to_ids)
    input_ids = pad_sequences(input_ids,
                              maxlen=MAX_LEN,
                              dtype='long',
                              truncating='post',
                              padding='post')
    # Attention masks mark how many tokens are in a sentence
    attention_masks = []
    for seq in input_ids:
        seq_mask = [float(i > 0) for i in seq]
        attention_masks.append(seq_mask)
    return input_ids, attention_masks
Code example #10
File: main_xlnet.py Project: pj0616/n2c2_2019_medsts
def main():
    torch.cuda.empty_cache()
    parser = setup_parser()
    args = parser.parse_args()
    if os.path.exists(args.output_dir) and os.listdir(
            args.output_dir
    ) and args.do_train and not args.overwrite_output_dir:
        raise ValueError("Output directory already exists and is not empty.")
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    args.n_gpu = torch.cuda.device_count()
    args.device = device
    set_seed(args)
    args.task_name = args.task_name.lower()
    if args.task_name not in processors:
        raise ValueError("Task not found: {}".format(args.task_name))
    processor = processors[args.task_name]()
    args.output_mode = output_modes[args.task_name]
    label_list = processor.get_labels()
    num_labels = len(label_list)

    ## Load models
    config = XLNetConfig.from_pretrained(args.config_name)
    print('config: {}'.format(config))
    tokenizer = XLNetTokenizer.from_pretrained(
        args.text_encoder_checkpoint, do_lower_case=args.do_lower_case)
    text_encoder = XLNetModel.from_pretrained(args.text_encoder_checkpoint,
                                              config=config)
    graph_encoder = GraphEncoder(args.n_hidden, args.min_score)
    if args.graph_encoder_checkpoint:
        graph_encoder.gcnnet.load_state_dict(
            torch.load(args.graph_encoder_checkpoint))

    medsts_classifier = PairClassifier(config.hidden_size + args.n_hidden, 1)
    medsts_c_classifier = PairClassifier(config.hidden_size + args.n_hidden, 5)
    medsts_type_classifier = PairClassifier(config.hidden_size + args.n_hidden,
                                            4)
    model = MedstsNet(text_encoder, graph_encoder, medsts_classifier,
                      medsts_c_classifier, medsts_type_classifier, config)
    model.to(args.device)

    args.n_gpu = 1

    if args.do_train:
        train_dataset = load_and_cache_examples(args,
                                                args.task_name,
                                                tokenizer,
                                                evaluate=False)
        global_step, tr_loss = train(args, train_dataset, model, tokenizer)
        logger.info('global step = {}, average loss = {}'.format(
            global_step, tr_loss))
        if not os.path.exists(args.output_dir):
            os.makedirs(args.output_dir)
        logger.info("saving model checkpoint to {}".format(args.output_dir))
        model_to_save = model.module if hasattr(model, 'module') else model
        # model_to_save.save_pretrained(args.output_dir)
        torch.save(model_to_save.state_dict(),
                   os.path.join(args.output_dir, 'saved_model.pth'))
        tokenizer.save_pretrained(args.output_dir)
        torch.save(args, os.path.join(args.output_dir, 'training_args.bin'))
Code example #11
    def load(cls, vocab_file: str, cache_model: bool = True) -> XLNetTokenizer:
        if vocab_file in cls._cache:
            return cls._cache[vocab_file]

        model = XLNetTokenizer(vocab_file=vocab_file)
        if cache_model:
            cls._cache[vocab_file] = model

        return model
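
Hypothetical usage, assuming load is a classmethod of a PretrainedXLNetTokenizer class that defines a class-level _cache dict (neither is shown here):

tokenizer = PretrainedXLNetTokenizer.load('spiece.model')
same_tokenizer = PretrainedXLNetTokenizer.load('spiece.model')  # served from the cache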
Code example #12
    def __init__(
        self,
        gpu=-1,
        check_for_lowercase=True,
        embeddings_dim=0,
        verbose=True,
        path_to_pretrained="xlnet-base-cased",
        model_frozen=True,
        bos_token="<s>",
        eos_token="</s>",
        unk_token="<unk>",
        sep_token="<sep>",
        pad_token="<pad>",
        cls_token="<cls>",
        mask_token="<mask>",
    ):
        SeqIndexerBaseEmbeddings.__init__(
            self,
            gpu=gpu,
            check_for_lowercase=check_for_lowercase,
            zero_digits=True,
            bos_token=bos_token,
            eos_token=eos_token,
            pad=pad_token,
            unk=unk_token,
            sep_token=sep_token,
            cls_token=cls_token,
            mask_token=mask_token,
            load_embeddings=True,
            embeddings_dim=embeddings_dim,
            verbose=verbose,
            isBert=False,
            isXlNet=True)

        print("create seq indexer Transformers from Model {}".format(
            path_to_pretrained))

        self.xlnet = True

        self.path_to_pretrained = path_to_pretrained
        self.tokenizer = XLNetTokenizer.from_pretrained(path_to_pretrained)
        self.config = XLNetConfig.from_pretrained(path_to_pretrained)
        self.emb = XLNetModel.from_pretrained(path_to_pretrained)
        self.frozen = model_frozen
        for param in self.emb.parameters():
            param.requires_grad = False
        for elem in [
                self.emb.word_embedding, self.emb.layer, self.emb.dropout
        ]:
            for param in elem.parameters():
                param.requires_grad = False

        if not self.frozen:
            # XLNetModel has no BERT-style `pooler` attribute; unfreeze the top
            # transformer block instead so something remains trainable.
            for param in self.emb.layer[-1].parameters():
                param.requires_grad = True
        self.emb.eval()
        print("XLNET model loaded succesifully")
Code example #13
 def load_tokenizer(self):
     if self.model_configuration.is_xlnet:
         self.tokenizer = XLNetTokenizer.from_pretrained(self.model_configuration.bert_model,
                                                         do_lower_case=self.model_configuration.do_lower)
     elif not self.model_configuration.is_scibert:
         self.tokenizer = BertTokenizer.from_pretrained(self.model_configuration.bert_model,
                                                        do_lower_case=self.model_configuration.do_lower)
     else:
         self.tokenizer = BertTokenizer(self.model_configuration.vocab_file,
                                        do_lower_case=self.model_configuration.do_lower)
Code example #14
File: tokenizer.py Project: agentsolaris/superglue
def get_tokenizer(tokenizer_name):
    logger.info(f"Loading Tokenizer {tokenizer_name}")

    if tokenizer_name.startswith("xlnet"):
        do_lower_case = "uncased" in tokenizer_name
        tokenizer = XLNetTokenizer.from_pretrained(
            tokenizer_name, do_lower_case=do_lower_case
        )

    return tokenizer
Code example #15
    def __init__(self, model_name_or_path: str, max_seq_length: int = 128, do_lower_case: bool = False):
        super(XLNet, self).__init__()
        self.config_keys = ['max_seq_length', 'do_lower_case']
        self.max_seq_length = max_seq_length
        self.do_lower_case = do_lower_case

        self.xlnet = XLNetModel.from_pretrained(model_name_or_path)
        self.tokenizer = XLNetTokenizer.from_pretrained(model_name_or_path, do_lower_case=do_lower_case)
        self.cls_token_id = self.tokenizer.convert_tokens_to_ids([self.tokenizer.cls_token])[0]
        self.sep_token_id = self.tokenizer.convert_tokens_to_ids([self.tokenizer.sep_token])[0]
Code example #16
File: common.py Project: ZengElva/nlp
    def __init__(self, language=Language.ENGLISHCASED, cache_dir="."):
        """Initializes the underlying pretrained XLNet tokenizer.

        Args:
            language (Language, optional): The pretrained model's language.
                                           Defaults to Language.ENGLISHCASED.
            cache_dir (str, optional): Directory for caching the downloaded
                                       tokenizer files. Defaults to ".".
        """
        self.tokenizer = XLNetTokenizer.from_pretrained(language.value,
                                                        cache_dir=cache_dir)
        self.language = language
Code example #17
def gen_dataloader(_train_path,
                   _test_path,
                   batch_size,
                   preprocess_inputs=False,
                   tokenizer_type='bert-base-uncased',
                   input_len=128,
                   **kwargs):
    """
    Helper function that takes either just the train data path or both
    train and test data an outputs the appropriate dataloader instance

    kwargs are:
    for preprocessing:
    sample_size=None,
    weak_supervision=True
    max_len = 128
    filter_bad_rows = True
    tokenizer = DFAULT_TOKENIIZER
    
    For dataloaders:
    val_sample_dataloader=True
    pin_memory = False
    num_workers = 0
    """

    if 'bert' in tokenizer_type.lower():
        tokenizer = BertTokenizer.from_pretrained(tokenizer_type)
    elif 'xlnet' in tokenizer_type.lower():
        tokenizer = XLNetTokenizer.from_pretrained(tokenizer_type)
    else:
        raise NotImplementedError(
            'model {} is not implemented'.format(tokenizer_type))

    train_dataset = read_data_to_dataframe(_train_path)
    if preprocess_inputs:
        df_train = preprocess_model_inputs(train_dataset,
                                           tokenizer=tokenizer,
                                           output_len=input_len,
                                           **kwargs)
    else:
        df_train = train_dataset

    if _test_path:
        test_dataset = read_data_to_dataframe(_test_path)
        if preprocess_inputs:
            df_test = preprocess_model_inputs(test_dataset,
                                              tokenizer=tokenizer,
                                              **kwargs)
        else:
            df_test = test_dataset
        dl = TrainValDataloader(df_train, df_test, batch_size, kwargs)
        return dl

    dl = TrainValSplitDataloader(df_train, batch_size, kwargs)
    return dl
Code example #18
File: xlnet.py Project: nbaghel777/nlpaug
    def __init__(self, model_path='xlnet-base-cased', padding_text=None, device='cuda'):
        super().__init__()
        self.model_path = model_path
        self.device = device

        self.tokenizer = XLNetTokenizer.from_pretrained(model_path)
        self.model = XLNetLMHeadModel.from_pretrained(model_path)

        self.padding_text_idxes = self.tokenizer.encode(padding_text or self.PADDING_TEXT)

        self.model.to(device)
        self.model.eval()
Code example #19
def run(args):
    nli_model_path = 'saved_models/xlnet-base-cased/'
    model_file = os.path.join(nli_model_path, 'pytorch_model.bin')
    config_file = os.path.join(nli_model_path, 'config.json')
    log = get_logger('conduct_test')
    model_name = 'xlnet-base-cased'
    tokenizer = XLNetTokenizer.from_pretrained(model_name, do_lower_case=True)
    xlnet_config = XLNetConfig.from_pretrained(config_file)
    model = XLNetForSequenceClassification.from_pretrained(model_file,
                                                           config=xlnet_config)
    dataset_reader = ConductDatasetReader(args, tokenizer, log)
    file_lines = dataset_reader.get_file_lines('data/dados.tsv')

    results = []
    softmax_fn = torch.nn.Softmax(dim=1)

    model.eval()
    with torch.no_grad():
        for line in tqdm(file_lines):
            premise, hypothesis, conflict = dataset_reader.parse_line(line)
            pair_word_ids, input_mask, pair_segment_ids = dataset_reader.convert_text_to_features(
                premise, hypothesis)
            tensor_word_ids = torch.tensor([pair_word_ids],
                                           dtype=torch.long,
                                           device=args.device)
            tensor_input_mask = torch.tensor([input_mask],
                                             dtype=torch.long,
                                             device=args.device)
            tensor_segment_ids = torch.tensor([pair_segment_ids],
                                              dtype=torch.long,
                                              device=args.device)
            model_input = {
                'input_ids': tensor_word_ids,  # word ids
                'attention_mask': tensor_input_mask,  # input mask
                'token_type_ids': tensor_segment_ids
            }
            outputs = model(**model_input)
            logits = outputs[0]
            nli_scores, nli_class = get_scores_and_class(logits, softmax_fn)
            nli_scores = nli_scores.detach().cpu().numpy()
            results.append({
                "conduct": premise,
                "complaint": hypothesys,
                "nli_class": nli_class,
                "nli_contradiction_score": nli_scores[0],
                "nli_entailment_score": nli_scores[1],
                "nli_neutral_score": nli_scores[2],
                "conflict": conflict
            })

    df = pd.DataFrame(results)
    df.to_csv('results/final_results.tsv', sep='\t', index=False)
Code example #20
def add_pytorch_transformers_vocab(vocab, tokenizer_name):
    """Add vocabulary from tokenizers in pytorch_transformers for use with pre-tokenized data.

    These tokenizers have a convert_tokens_to_ids method, but this doesn't do
    anything special, so we can just use the standard indexers.
    """
    do_lower_case = "uncased" in tokenizer_name

    if tokenizer_name.startswith("bert-"):
        tokenizer = BertTokenizer.from_pretrained(tokenizer_name,
                                                  do_lower_case=do_lower_case)
    elif tokenizer_name.startswith("roberta-"):
        tokenizer = RobertaTokenizer.from_pretrained(tokenizer_name)
    elif tokenizer_name.startswith("xlnet-"):
        tokenizer = XLNetTokenizer.from_pretrained(tokenizer_name,
                                                   do_lower_case=do_lower_case)
    elif tokenizer_name.startswith("openai-gpt"):
        tokenizer = OpenAIGPTTokenizer.from_pretrained(tokenizer_name)
    elif tokenizer_name.startswith("gpt2"):
        tokenizer = GPT2Tokenizer.from_pretrained(tokenizer_name)
    elif tokenizer_name.startswith("transfo-xl-"):
        tokenizer = TransfoXLTokenizer.from_pretrained(tokenizer_name)
    elif tokenizer_name.startswith("xlm-"):
        tokenizer = XLMTokenizer.from_pretrained(tokenizer_name)

    if (tokenizer_name.startswith("openai-gpt")
            or tokenizer_name.startswith("gpt2")
            or tokenizer_name.startswith("transo-xl-")):
        tokenizer.add_special_tokens({
            "bos_token": "<start>",
            "sep_token": "<delim>",
            "cls_token": "<extract>"
        })
    # TODO: this is another place can be simplified by "model-before-preprocess" reorganization
    # we can pass tokenizer created in model here, see issue <TBD>

    vocab_size = len(tokenizer)
    # do not use tokenizer.vocab_size, it does not include newly added token
    if tokenizer_name.startswith("roberta-"):
        if tokenizer.convert_ids_to_tokens(vocab_size - 1) is None:
            vocab_size -= 1
        else:
            log.info("Time to delete vocab_size-1 in preprocess.py !!!")
    # due to a quirk in huggingface's file, the last token of RobertaTokenizer is None, remove
    # this when they fix the problem

    ordered_vocab = tokenizer.convert_ids_to_tokens(range(vocab_size))
    log.info("Added pytorch_transformers vocab (%s): %d tokens",
             tokenizer_name, len(ordered_vocab))
    for word in ordered_vocab:
        vocab.add_token_to_namespace(
            word, input_module_tokenizer_name(tokenizer_name))
Code example #21
 def __init__(self,
              chunk_size=64,
              max_length=35,
              device=torch.device('cuda:0')):
     super(XLNetClient, self).__init__()
     self.chunk_size = chunk_size
     self.tokenizer = XLNetTokenizer.from_pretrained('xlnet-large-cased')
     self.max_length = max_length
     # load the model
     self.model = XLNetModel.from_pretrained('xlnet-large-cased')
     self.model.eval()
     self.device = device
     # move model to device
     self.model.to(self.device)
Code example #22
 def __init__(self, args):
     self.args = args
     ## self.tokenizer = BertTokenizer.from_pretrained('bert-base-uncased', do_lower_case=True)
     self.tokenizer = XLNetTokenizer.from_pretrained('xlnet-large-cased',
                                                     do_lower_case=False)
     self.sep_token = '<sep>'
     self.cls_token = '<cls>'
     self.pad_token = '<pad>'
     self.tgt_bos = '<s>'
     self.tgt_eos = '</s>'
     self.tgt_sent_split = '<sep>'
     self.sep_vid = self.tokenizer.sp_model[self.sep_token]
     self.cls_vid = self.tokenizer.sp_model[self.cls_token]
     self.pad_vid = self.tokenizer.sp_model[self.pad_token]
Code example #23
def load_write_xlnet_vocab(model_name, save_vocab_path):
    from pytorch_transformers import XLNetTokenizer
    tokenizer = XLNetTokenizer.from_pretrained(model_name)
    # tokenizer.save_vocabulary(vocab_path)
    vocab_dict = OrderedDict()
    vocab_size = tokenizer.vocab_size
    for i in range(vocab_size):
        token = tokenizer.convert_ids_to_tokens(i)
        vocab_dict[i] = token
    print(len(vocab_dict))
    print(tokenizer.vocab_size)
    with open(save_vocab_path, 'w') as writer:
        for k, v in vocab_dict.items():
            writer.write('{0}\n'.format(v))
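
Example invocation of the function above (the output path is a placeholder):

load_write_xlnet_vocab('xlnet-base-cased', './xlnet_vocab.txt')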
Code example #24
    def __init__(self, opt):
        self.opt = opt
        if 'roberta' in opt.pretrained_bert_name:
            tokenizer = RobertaTokenizer.from_pretrained(
                opt.pretrained_bert_name)
            transformer = RobertaModel.from_pretrained(
                opt.pretrained_bert_name, output_attentions=True)
        elif 'bert' in opt.pretrained_bert_name:
            tokenizer = BertTokenizer.from_pretrained(opt.pretrained_bert_name)
            transformer = BertModel.from_pretrained(opt.pretrained_bert_name,
                                                    output_attentions=True)
        elif 'xlnet' in opt.pretrained_bert_name:
            tokenizer = XLNetTokenizer.from_pretrained(
                opt.pretrained_bert_name)
            transformer = XLNetModel.from_pretrained(opt.pretrained_bert_name,
                                                     output_attentions=True)
        if 'bert' in opt.model_name or 'xlnet' in opt.model_name:
            tokenizer = Tokenizer4Pretrain(tokenizer, opt.max_seq_len)
            self.model = opt.model_class(transformer, opt).to(opt.device)
        # elif 'xlnet' in opt.model_name:
        #     tokenizer = Tokenizer4Pretrain(tokenizer, opt.max_seq_len)
        #     self.model = opt.model_class(bert,opt).to(opt.device)
        else:
            tokenizer = build_tokenizer(
                fnames=[opt.dataset_file['train'], opt.dataset_file['test']],
                max_seq_len=opt.max_seq_len,
                dat_fname='{0}_tokenizer.dat'.format(opt.dataset))
            embedding_matrix = build_embedding_matrix(
                word2idx=tokenizer.word2idx,
                embed_dim=opt.embed_dim,
                dat_fname='{0}_{1}_embedding_matrix.dat'.format(
                    str(opt.embed_dim), opt.dataset))
            self.model = opt.model_class(embedding_matrix, opt).to(opt.device)

        self.trainset = ABSADataset(opt.dataset_file['train'], tokenizer)
        self.testset = ABSADataset(opt.dataset_file['test'], tokenizer)
        assert 0 <= opt.valset_ratio < 1
        if opt.valset_ratio > 0:
            valset_len = int(len(self.trainset) * opt.valset_ratio)
            self.trainset, self.valset = random_split(
                self.trainset, (len(self.trainset) - valset_len, valset_len))
        else:
            self.valset = self.testset

        if opt.device.type == 'cuda':
            logger.info('cuda memory allocated: {}'.format(
                torch.cuda.memory_allocated(device=opt.device.index)))
        self._print_args()
Code example #25
def validate(args, device_id, pt, step):
    device = "cpu" if args.visible_gpus == '-1' else "cuda"
    if (pt != ''):
        test_from = pt
    else:
        test_from = args.test_from
    logger.info('Loading checkpoint from %s' % test_from)
    checkpoint = torch.load(test_from,
                            map_location=lambda storage, loc: storage)
    opt = vars(checkpoint['opt'])
    for k in opt.keys():
        if (k in model_flags):
            setattr(args, k, opt[k])
    print(args)

    model = AbsSummarizer(args, device, checkpoint)
    model.eval()

    valid_iter = data_loader.Dataloader(args,
                                        load_dataset(args,
                                                     'valid',
                                                     shuffle=False),
                                        args.batch_size,
                                        device,
                                        shuffle=False,
                                        is_test=False)

    tokenizer = XLNetTokenizer.from_pretrained('xlnet-large-cased',
                                               do_lower_case=False,
                                               cache_dir=args.temp_dir)
    symbols = {
        'BOS': tokenizer.sp_model['<s>'],
        'EOS': tokenizer.sp_model['</s>'],
        'PAD': tokenizer.sp_model['<pad>'],
        'EOQ': tokenizer.sp_model['<unk>']
    }

    valid_loss = abs_loss(model.generator,
                          symbols,
                          model.vocab_size,
                          train=False,
                          device=device)

    trainer = build_trainer(args, device_id, model, None, valid_loss)
    stats = trainer.validate(valid_iter, step)
    return stats.xent()
Code example #26
File: squad2_reader.py Project: anoop2019/simple-qa
    def __init__(self, tokenizer_name="xlnet-large-cased", max_seq_len=384, doc_stride=128, max_query_len=64, is_training=True):
        """
            Reader for squad 2 dataset

            Input:
                - tokenizer_name: string, default is xlnet-large-cased, tokenizer model name or path
                - max_seq_len: int, default is 384, The maximum total input sequence length after WordPiece tokenization. Sequences longer than this will be truncated, and sequences shorter than this will be padded.
                - doc_stride: int,  default is 128, When splitting up a long document into chunks, how much stride to take between chunks.
                - max_query_len: int, default is 64, The maximum number of tokens for the question. Questions longer than this will be truncated to this length.
                - is_training: bool, default is True
        """
        self.is_training = is_training
        self.tokenizer = XLNetTokenizer.from_pretrained(tokenizer_name, do_lower_case=True)
        self.max_seq_len = max_seq_len
        self.doc_stride = doc_stride
        self.max_query_len = max_query_len
        self.tokenizer_name = tokenizer_name
Code example #27
    def __init__(self,
                 args,
                 model,
                 vocab,
                 symbols,
                 global_scorer=None,
                 logger=None,
                 dump_beam=""):
        self.logger = logger
        self.cuda = args.visible_gpus != '-1'

        self.args = args
        self.model = model
        self.generator = self.model.generator
        self.tokenizer = XLNetTokenizer.from_pretrained('xlnet-large-cased',
                                                        do_lower_case=False,
                                                        cache_dir="../temp")
        self.vocab = vocab
        self.symbols = symbols
        self.start_token = symbols['BOS']
        self.end_token = symbols['EOS']

        self.global_scorer = global_scorer
        self.beam_size = args.beam_size
        self.min_length = args.min_length
        self.max_length = args.max_length

        self.dump_beam = dump_beam

        # for debugging
        self.beam_trace = self.dump_beam != ""
        self.beam_accum = None

        tensorboard_log_dir = args.model_path

        self.tensorboard_writer = SummaryWriter(tensorboard_log_dir,
                                                comment="Unmt")

        if self.beam_trace:
            self.beam_accum = {
                "predicted_ids": [],
                "beam_parent_ids": [],
                "scores": [],
                "log_probs": []
            }
Code example #28
    def init_params(self, model_name, pre_trained_model, f_lr=5e-5, f_eps=1e-8):
        MODEL_CLASSES   = { "xlnet": (XLNetConfig, XLNetForQuestionAnswering, XLNetTokenizer) }
        # self._config, self._model_class, self._tokenizer = MODEL_CLASSES[model_name]
        self._tokenizer = XLNetTokenizer.from_pretrained(pre_trained_model, do_lower_case=True)
        # do_lower_case is a tokenizer option, not a config option
        self._config    = XLNetConfig.from_pretrained(pre_trained_model)
        self._model     = XLNetForQuestionAnswering.from_pretrained(pre_trained_model, config=self._config)
        self._model.to(self._device)

        no_decay = ['bias', 'LayerNorm.weight']
        weight_decay = 0.0 # Author's default parameter
        optimizer_grouped_parameters = [
                  {'params': [p for n, p in self._model.named_parameters() \
                          if not any(nd in n for nd in no_decay)], 'weight_decay': weight_decay},
                  {'params': [p for n, p in self._model.named_parameters() \
                          if any(nd in n for nd in no_decay)], 'weight_decay': 0.0}
                  ]
        # warmup_steps = 0.0
        self._optimizer = AdamW(optimizer_grouped_parameters, lr=f_lr, eps=f_eps)
Code example #29
File: Zmodel.py Project: cooelf/LIMIT-BERT
def get_xlnet(xlnet_model):
    # Avoid a hard dependency on XLNet by only importing it if it's being used
    from pytorch_transformers import (WEIGHTS_NAME, XLNetModel, XLMConfig,
                                      XLMForSequenceClassification,
                                      XLMTokenizer, XLNetConfig,
                                      XLNetLMHeadModel,
                                      XLNetForSequenceClassification,
                                      XLNetTokenizer)
    print(xlnet_model)
    tokenizer = XLNetTokenizer.from_pretrained(xlnet_model)
    xlnet = XLNetLMHeadModel.from_pretrained(xlnet_model)

    # if bert_model.endswith('.tar.gz'):
    #     tokenizer = BertTokenizer.from_pretrained(bert_model.replace('.tar.gz', '-vocab.txt'), do_lower_case=bert_do_lower_case)
    # else:
    #     tokenizer = BertTokenizer.from_pretrained(bert_model, do_lower_case=bert_do_lower_case)
    # bert = BertModel.from_pretrained(bert_model)
    return tokenizer, xlnet
Code example #30
 def __init__(self):
     self.vocab_root = "../data/vocab.txt"
     self.xlnet_config_root = "../data/config.json"
     self.pretrained_xlnet_root = "../data/pytorch_model.bin"
     self.tokenizer_root = "../data/spiece.model"
     self.raw_train_data_root = "../data/train.txt"
     self.split_train_data_root = "../data/split_train_data.json"
     self.raw_train_label_root = "../data/train_answer.csv"
     self.raw_test_data_root = "../data/dev.txt"
     self.model_root = "../model/"
     self.data_root = "../data/"
     self.idiom_vocab_root = "../data/idiomList.txt"
     self.prob_file = "../data/prob.csv"
     self.result_file = "../data/result.csv"
     self.raw_result_file = "../data/raw_result.csv"
     self.xlnet_learning_rate = 2e-5
     self.other_learning_rate = 1e-3
     self.max_seq_length = 128
     self.num_train_epochs = 100
     self.warmup_proportion = 0.01
     self.hidden_dropout_prob = 0.5
     self.num_workers = 8
     self.eval_ratio = 0.02
     with open(self.data_root + "idiom2index", mode="rb") as f1:
         self.idiom2index = pickle.load(f1)
     with open(self.data_root + "index2idiom", mode="rb") as f2:
         self.index2idiom = pickle.load(f2)
     self.use_gpu = t.cuda.is_available()
     self.device = t.device("cuda" if self.use_gpu else "cpu")
     self.n_gpu = t.cuda.device_count()
     self.train_batch_size = 10 * self.n_gpu * int(256 / self.max_seq_length)
     self.test_batch_size = 32 * self.n_gpu * int(256 / self.max_seq_length)
     self.logger = logging.getLogger("xlnetCloze_train")
     self.logger.setLevel(logging.INFO)
     self.writer = SummaryWriter('tensorlog')
     self.decay = 0.3
     self.min_lr = 5e-7
     self.patience = 1
     self.seed = 42
     self.show_loss_step = 200
     self.version = 30
     self.tokenizer = XLNetTokenizer.from_pretrained(self.tokenizer_root)