Example 1
def get_model_and_tokenizer(args, trainer_config, logger):
    if args.model_type == 'gpt':
        if args.single_input:
            model = OpenAIGPTLMHeadModel.from_pretrained('./openai-gpt')
        else:
            model = OpenAIGPTEncoderDecoderModel.from_pretrained('./openai-gpt')
        tokenizer = OpenAIGPTTokenizer.from_pretrained('./openai-gpt')
    elif args.model_type == 'dialogpt':
        if args.single_input:
            model = GPT2DoubleHeadsModel.from_pretrained('./dialogpt')
        else:
            model = GPT2EncoderDecoderModel.from_pretrained('./dialogpt')
        tokenizer = GPT2Tokenizer.from_pretrained('./dialogpt')
    elif args.model_type == 'seq2seq':
        seq2seq_vocab = Seq2seqVocab(trainer_config.train_datasets, trainer_config.valid_datasets,
                                     trainer_config.test_datasets, args.vocab_path, data_type=trainer_config.data_type,
                                     extend_exist_vocab=args.extend_exist_vocab)
        tokenizer = seq2seq_vocab.vocab
        args.dialog_embeddings = False
        model = TransformerSeq2Seq(args.emb_dim, args.hidden_dim, args.num_layers, args.heads, args.depth_size,
                                   args.filter_size, tokenizer, args.pretrained_emb_file, args.pointer_gen, logger,
                                   multi_input=not args.single_input, attention_pooling_type=args.attention_pooling_type,
                                   label_smoothing=args.label_smoothing)
    else:
        if args.single_input:
            model = GPT2DoubleHeadsModel.from_pretrained('./gpt2-small')
        else:
            model = GPT2EncoderDecoderModel.from_pretrained('./gpt2-small')
        tokenizer = GPT2Tokenizer.from_pretrained('./gpt2-small')
    return model, tokenizer
    def test_tokenization_gpt2(self):
        # Given
        self.base_tokenizer = GPT2Tokenizer.from_pretrained(
            'gpt2', do_lower_case=True, cache_dir=self.test_dir)
        self.rust_tokenizer = PyGpt2Tokenizer(
            get_from_cache(
                self.base_tokenizer.pretrained_vocab_files_map['vocab_file']
                ['gpt2']),
            get_from_cache(
                self.base_tokenizer.pretrained_vocab_files_map['merges_file']
                ['gpt2']))
        output_baseline = []
        for example in self.examples:
            output_baseline.append(
                self.base_tokenizer.encode_plus(
                    example.text_a,
                    add_special_tokens=True,
                    return_overflowing_tokens=True,
                    return_special_tokens_mask=True,
                    max_length=128))

        # When
        output_rust = self.rust_tokenizer.encode_list(
            [example.text_a for example in self.examples],
            max_len=128,
            truncation_strategy='longest_first',
            stride=0)

        # Then
        for rust, baseline in zip(output_rust, output_baseline):
            assert (rust.token_ids == baseline['input_ids'])
            assert (rust.segment_ids == baseline['token_type_ids'])
            assert (
                rust.special_tokens_mask == baseline['special_tokens_mask'])
Example 3
def gpt2Tokenizer(*args, **kwargs):
    """
    Instantiate a GPT-2 BPE tokenizer for OpenAI GPT-2 from a pre-trained/customized vocab file.
    Peculiarities:
        - Byte-level BPE

    Args:
    pretrained_model_name_or_path: Path to pretrained model archive
                                   or one of pre-trained vocab configs below.
                                       * gpt2
    Keyword args:
    special_tokens: Special tokens in vocabulary that are not pretrained ([SEP], [CLS]...)
                    Default: None
    max_len: An artificial maximum length to truncate tokenized sequences to;
             Effective maximum length is always the minimum of this
             value (if specified) and the underlying GPT-2 model's
             sequence length.
             Default: None

    Example:
        >>> import torch
        >>> tokenizer = torch.hub.load('huggingface/pytorch-transformers', 'gpt2Tokenizer', 'gpt2')

        >>> text = "Who was Jim Henson ?"
        >>> indexed_tokens = tokenizer.encode(text)
    """
    tokenizer = GPT2Tokenizer.from_pretrained(*args, **kwargs)
    return tokenizer
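A short round-trip sketch using the tokenizer returned above (a hedged usage example that assumes only the standard GPT2Tokenizer encode/decode API, nothing repository-specific):

# usage sketch: encode a sentence to byte-level BPE ids and decode it back
tokenizer = gpt2Tokenizer('gpt2')
ids = tokenizer.encode("Who was Jim Henson ?")
print(ids)                    # list of BPE token ids
print(tokenizer.decode(ids))  # recovers the text (up to tokenization clean-up)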
Example 4
 def load(cls,
          pretrained_model_name_or_path,
          tokenizer_class=None,
          **kwargs):
     try:
         ret = super(CustomTokenizer,
                     cls).load(pretrained_model_name_or_path,
                               tokenizer_class, **kwargs)
         return ret
     except Exception:  # Custom models
         pretrained_model_name_or_path = str(pretrained_model_name_or_path)
         if tokenizer_class is None:
             if "gpt2" in pretrained_model_name_or_path.lower():
                 tokenizer_class = "GPT2Tokenizer"
             else:
                 raise ValueError(
                     f"Could not infer tokenizer_class from model config or "
                     f"name '{pretrained_model_name_or_path}'. Set arg `tokenizer_class` "
                     f"in Tokenizer.load() to one of: AlbertTokenizer, XLMRobertaTokenizer, "
                     f"RobertaTokenizer, DistilBertTokenizer, BertTokenizer, XLNetTokenizer, "
                     f"CamembertTokenizer, ElectraTokenizer, DPRQuestionEncoderTokenizer,"
                     f"DPRContextEncoderTokenizer.")
             logger.info(f"Loading tokenizer of type '{tokenizer_class}'")
         ret = None
         if tokenizer_class == "GPT2Tokenizer":
             ret = GPT2Tokenizer.from_pretrained(
                 pretrained_model_name_or_path, **kwargs)
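             # GPT-2 has no dedicated padding token, so fall back to the unk token ('<|endoftext|>') for padding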
             ret.pad_token = ret.unk_token
         if ret is None:
             raise Exception("Unable to load tokenizer")
         else:
             return ret
    def test_tokenization_gpt2(self):
        # Given
        self.base_tokenizer = GPT2Tokenizer.from_pretrained('gpt2',
                                                            do_lower_case=True,
                                                            cache_dir=self.test_dir)
        self.rust_tokenizer = PyGpt2Tokenizer(
            get_from_cache(self.base_tokenizer.pretrained_vocab_files_map['vocab_file']['gpt2']),
            get_from_cache(self.base_tokenizer.pretrained_vocab_files_map['merges_file']['gpt2']), do_lower_case=True
        )
        output_baseline = []
        for example in self.examples:
            output_baseline.append(self.base_tokenizer.encode_plus(example.text_a,
                                                                   add_special_tokens=True,
                                                                   return_overflowing_tokens=True,
                                                                   return_special_tokens_mask=True,
                                                                   max_length=128))

        # When
        output_rust = self.rust_tokenizer.encode_list([example.text_a for example in self.examples],
                                                      max_len=128,
                                                      truncation_strategy='longest_first',
                                                      stride=0)

        # Then
        for idx, (rust, baseline) in enumerate(zip(output_rust, output_baseline)):
            assert rust.token_ids == baseline[
                'input_ids'], f'Difference in tokenization for {self.rust_tokenizer.__class__}: \n ' \
                              f'Sentence a: {self.examples[idx].text_a} \n' \
                              f'Sentence b: {self.examples[idx].text_b} \n' \
                              f'Token mismatch: {self.get_token_diff(rust.token_ids, baseline["input_ids"])} \n' \
                              f'Rust: {rust.token_ids} \n' \
                              f' Python {baseline["input_ids"]}'
            assert (rust.segment_ids == baseline['token_type_ids'])
            assert (rust.special_tokens_mask == baseline['special_tokens_mask'])
Example 6
	def test_init(self, load_PretrainedVocab):
		vocab = load_PretrainedVocab()
		vocab_file = './tests/dataloader/dummy_gpt2vocab/vocab.json'
		merges_file = './tests/dataloader/dummy_gpt2vocab/merges.txt'
		from transformers.tokenization_gpt2 import GPT2Tokenizer
		toker = PretrainedTokenizer(GPT2Tokenizer(vocab_file, merges_file, unk_token='<|endoftext|>'))
		assert vocab.tokenizer.get_setting_hash() == toker.get_setting_hash()
		assert vocab.get_setting_hash() == load_PretrainedVocab().get_setting_hash()
		assert vocab.get_vocab_hash() == load_PretrainedVocab().get_vocab_hash()
    def test_full_tokenizer(self):
        tokenizer = GPT2Tokenizer(self.vocab_file, self.merges_file, **self.special_tokens_map)
        text = "lower newer"
        bpe_tokens = ["\u0120low", "er", "\u0120", "n", "e", "w", "er"]
        tokens = tokenizer.tokenize(text, add_prefix_space=True)
        self.assertListEqual(tokens, bpe_tokens)

        input_tokens = tokens + [tokenizer.unk_token]
        input_bpe_tokens = [14, 15, 10, 9, 3, 2, 15, 19]
        self.assertListEqual(tokenizer.convert_tokens_to_ids(input_tokens), input_bpe_tokens)
    def __init__(self):
        self.tokenizer = GPT2Tok.from_pretrained("gpt2")
        self.tokenizer.max_len = 10000

        self.pad_tok, self.start_tok, self.end_tok = "<PAD>", " ST", " END"

        self.pad_id = 0
        self.start_id = self.tokenizer.encode(self.start_tok)[0]
        self.end_id = self.tokenizer.encode(self.end_tok)[0]
        self.vocab_size = self.tokenizer.vocab_size
Example 9
	def test_convert(self, load_PretrainedVocab):
		vocab = load_PretrainedVocab()
		vocab_file = './tests/dataloader/dummy_gpt2vocab/vocab.json'
		merges_file = './tests/dataloader/dummy_gpt2vocab/merges.txt'
		from transformers.tokenization_gpt2 import GPT2Tokenizer
		toker = GPT2Tokenizer(vocab_file, merges_file, unk_token='<|endoftext|>')
		assert vocab.frequent_vocab_size == vocab.all_vocab_size == len(toker.encoder)
		assert vocab.frequent_vocab_list == vocab.all_vocab_list == list(map(lambda x: x[0], sorted(toker.encoder.items(), key=lambda x: x[1])))
		
		tokens = ['A', 'Ġbeautiful', 'Ġdessert', 'Ġwaiting', 'Ġto', 'Ġbe', 'Ġshared', 'Ġby', 'Ġtwo', 'Ġpeople', '.']
		assert vocab.convert_ids_to_tokens(vocab.convert_tokens_to_ids(tokens, False)) == tokens
		assert vocab.convert_ids_to_tokens(vocab.convert_tokens_to_ids(tokens, True)) == tokens
Example 10
    def __init__(self, method):
        if "gpt" in method:
            from transformers.tokenization_gpt2 import GPT2Tokenizer
            self.tokenizer = GPT2Tokenizer.from_pretrained(method)
        elif "bert" in method:
            from transformers.tokenization_bert import BertTokenizer
            self.tokenizer = BertTokenizer.from_pretrained(method)
        else:
            raise ValueError(
                '`method` has an invalid value {}; should be "gpt"/"bpe" or "bert"'
                .format(method))

        self._tokenizer_class_name = self.tokenizer.__class__.__name__
def prepare_tokenizer(args):
    tokenizer = GPT2Tokenizer.from_pretrained(args.tokenizer_path)
    eod_token = tokenizer.encoder['<pad>']
    num_tokens = len(tokenizer)

    args.tokenizer_num_tokens = num_tokens
    args.eod_token = eod_token

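    # round the vocabulary size up to the next multiple of args.make_vocab_size_divisible_by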
    after = num_tokens
    while after % args.make_vocab_size_divisible_by != 0:
        after += 1

    args.vocab_size = after
    print(f"prepare tokenizer done, size {after}", flush=True)

    return tokenizer
Example 12
    def test_full_tokenizer(self):
        """ Adapted from Sennrich et al. 2015 and https://github.com/rsennrich/subword-nmt """
        vocab = ["l", "o", "w", "e", "r", "s", "t", "i", "d", "n",
                 "lo", "low", "er",
                 "low", "lowest", "newer", "wider", "<unk>"]
        vocab_tokens = dict(zip(vocab, range(len(vocab))))
        merges = ["#version: 0.2", "l o", "lo w", "e r", ""]
        special_tokens_map = {"unk_token": "<unk>"}

        with TemporaryDirectory() as tmpdirname:
            vocab_file = os.path.join(
                tmpdirname, VOCAB_FILES_NAMES['vocab_file'])
            merges_file = os.path.join(
                tmpdirname, VOCAB_FILES_NAMES['merges_file'])
            with open(vocab_file, "w") as fp:
                fp.write(json.dumps(vocab_tokens))
            with open(merges_file, "w") as fp:
                fp.write("\n".join(merges))

            input_text = u"lower newer"
            output_text = u"lower<unk>newer"

            create_and_check_tokenizer_commons(
                self, input_text, output_text, GPT2Tokenizer, tmpdirname, **special_tokens_map)

            tokenizer = GPT2Tokenizer(
                vocab_file, merges_file, **special_tokens_map)
            text = "lower"
            bpe_tokens = ["low", "er"]
            tokens = tokenizer.tokenize(text)
            self.assertListEqual(tokens, bpe_tokens)

            input_tokens = tokens + [tokenizer.unk_token]
            input_bpe_tokens = [13, 12, 17]
            self.assertListEqual(
                tokenizer.convert_tokens_to_ids(input_tokens), input_bpe_tokens)
                while j < len(bytes_rep):
                    if isinstance(bytes_rep[j], int):
                        text_list.append(''.join(bytes_rep[i:j]))
                        i = j
                        break
                    else:
                        j += 1

                if i != j:  # We reach end of string without any latin character
                    text_list.append(''.join(bytes_rep[i:]))
                    break
        return ' '.join(text_list)


if __name__ == '__main__':
    gpt2_en_tokenizer = GPT2Tokenizer.from_pretrained('distilgpt2')
    bert_cn_tokenizer = BertTokenizer.from_pretrained('bert-base-chinese')
    cn_en_tokenizer = ChineseEnglishTokenizer(gpt2_en_tokenizer,
                                              bert_cn_tokenizer)

    texts = [
        'pokemon你好吗 我好吗digimon gokemon', '六六六六六六六六', '我爱你, 你爱我吗?', '我,要。吃?饭!!',
        'I come from 北京 , I wanna go to 香港',
        '666 I come from 深圳 , I wanna eat 角质 at chinese restaurant',
        "我 的 帅 管家", "就 那种 typical 的 那 种 偶像 剧",
        "没有 没 有 爸爸 爸爸 爸爸 不 要 跟 家里 在一起 了 因为", "它 不可以 把",
        "还 有 假 睫毛 瞳孔 放大 片 加 假 睫毛 然后 就 可以 把 一 个 丑 女 变 美女", "你 讲 你 讲 你 讲", "三倍",
        "那个 胸部 也 是 我 爸 那 时候 看 那个", "没有 无所谓 我 只是 说 它 可是 把 它 挤 出来 而已",
        "没有 我 本来 我 刚才 有 东西 要 讲 的 你 插 我 的 话", "我 刚刚 讲到 差 很大",
        "那个 我 看 台湾 节目 一 大 堆", "就 她们 卸妆 然后 整个 变得 超 夸张 welcome 也是",
        "他 本来 爸爸 本来 有 未婚妻 ok 然后 他 不要 跟 他 未婚妻 结婚 他 喜欢 另外 一 个 女孩子",
 def get_tokenizer(self, **kwargs):
     kwargs.update(self.special_tokens_map)
     return GPT2Tokenizer.from_pretrained(self.tmpdirname, **kwargs)
Example 15
def count_tokens(text):
    encoding = GPT2Tokenizer.from_pretrained("gpt2-xl")
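    # calling the tokenizer directly returns a dict-like encoding that contains 'input_ids'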
    tokens = encoding(text)
    return len(tokens["input_ids"])
def train_cmodgpt2_and_augment(args):
    task_name = args.task_name
    os.makedirs(args.output_dir, exist_ok=True)

    random.seed(args.seed)
    np.random.seed(args.seed)
    torch.manual_seed(args.seed)

    processor = get_task_processor(task_name, args.data_dir)
    #label_list = processor.get_labels(task_name)

    # load train and dev data
    train_examples = processor.get_train_examples()
    dev_examples = processor.get_dev_examples()

    tokenizer = GPT2Tokenizer.from_pretrained(GPT2_MODEL,
                                              do_lower_case=True,
                                              cache_dir=args.cache)

    args.block_size = min(args.block_size, tokenizer.max_len_single_sentence)

    model = GPT2LMHeadModel.from_pretrained(GPT2_MODEL, cache_dir=args.cache)

    model.to(device)

    # train data
    train_features = convert_examples_to_features(train_examples,
                                                  args.block_size, tokenizer,
                                                  args.seed)
    train_data = prepare_data(train_features)
    train_sampler = RandomSampler(train_data)
    train_dataloader = DataLoader(train_data,
                                  sampler=train_sampler,
                                  batch_size=args.train_batch_size)

    # dev data
    dev_features = convert_examples_to_features(dev_examples, args.block_size,
                                                tokenizer, args.seed)
    dev_data = prepare_data(dev_features)
    dev_sampler = SequentialSampler(dev_data)
    dev_dataloader = DataLoader(dev_data,
                                sampler=dev_sampler,
                                batch_size=args.train_batch_size)

    num_train_steps = int(
        len(train_features) / args.train_batch_size * args.num_train_epochs)
    logger.info("***** Running training *****")
    logger.info("  Num examples = %d", len(train_features))
    logger.info("  Batch size = %d", args.train_batch_size)
    logger.info("  Num steps = %d", num_train_steps)

    # Prepare optimizer and schedule (linear warmup and decay)
    t_total = num_train_steps
    no_decay = ['bias', 'LayerNorm.weight']
    optimizer_grouped_parameters = [{
        'params': [
            p for n, p in model.named_parameters()
            if not any(nd in n for nd in no_decay)
        ],
        'weight_decay':
        0.01
    }, {
        'params': [
            p for n, p in model.named_parameters()
            if any(nd in n for nd in no_decay)
        ],
        'weight_decay':
        0.0
    }]
    optimizer = AdamW(optimizer_grouped_parameters,
                      lr=args.learning_rate,
                      eps=1e-8)

    best_dev_loss = float('inf')
    for epoch in trange(int(args.num_train_epochs), desc="Epoch"):
        avg_loss = 0.
        model.train()
        for step, batch in enumerate(train_dataloader):
            batch = tuple(t.to(device) for t in batch)

            inputs = {'input_ids': batch[0], 'labels': batch[1]}

            outputs = model(**inputs)
            loss = outputs[0]
            # loss = model(input_ids, segment_ids, input_mask, masked_ids)
            optimizer.zero_grad()
            loss.backward()
            avg_loss += loss.item()
            optimizer.step()
            model.zero_grad()
            if (step + 1) % 50 == 0:
                print("avg_loss: {}".format(avg_loss / 50))
                avg_loss = 0.

        # eval on dev after every epoch
        dev_loss = compute_dev_loss(model, dev_dataloader)
        print("Epoch {}, Dev loss {}".format(epoch, dev_loss))
        if dev_loss < best_dev_loss:
            best_dev_loss = dev_loss
            print("Saving model. Best dev so far {}".format(best_dev_loss))
            save_model_path = os.path.join(args.output_dir, 'best_cmodgpt2.pt')
            torch.save(model.state_dict(), save_model_path)

    # augment data using the best model
    augment_train_data(model, tokenizer, train_examples, args)
    def setup_class(self):
        self.use_gpu = torch.cuda.is_available()
        self.test_dir = Path(tempfile.mkdtemp())

        self.base_tokenizer = GPT2Tokenizer.from_pretrained(
            'distilgpt2', do_lower_case=True, cache_dir=self.test_dir)
        self.rust_tokenizer = PyGpt2Tokenizer(
            get_from_cache(
                self.base_tokenizer.pretrained_vocab_files_map['vocab_file']
                ['distilgpt2']),
            get_from_cache(
                self.base_tokenizer.pretrained_vocab_files_map['merges_file']
                ['distilgpt2']),
            do_lower_case=True)
        self.model = GPT2Model.from_pretrained('distilgpt2',
                                               output_attentions=False).eval()
        if self.use_gpu:
            self.model.cuda()
        #     Extracted from https://en.wikipedia.org/wiki/Deep_learning
        self.sentence_list = [
            'Deep learning (also known as deep structured learning or hierarchical learning) is part of a broader family of machine learning methods based on artificial neural networks.Learning can be supervised, semi-supervised or unsupervised.',
            'Deep learning is a class of machine learning algorithms that[11](pp199–200) uses multiple layers to progressively extract higher level features from the raw input.',
            'For example, in image processing, lower layers may identify edges, while higher layers may identify the concepts relevant to a human such as digits or letters or faces.',
            'Most modern deep learning models are based on artificial neural networks, specifically, Convolutional Neural Networks (CNN)s, although they can also include propositional formulas organized layer-wise in deep generative models.',
            'In deep learning, each level learns to transform its input data into a slightly more abstract and composite representation.',
            'In an image recognition application, the raw input may be a matrix of pixels; the first representational layer may abstract the pixels and encode edges; the second layer may compose and encode arrangements of edges;',
            'he third layer may encode a nose and eyes; and the fourth layer may recognize that the image contains a face. Importantly, a deep learning process can learn which features to optimally place in which level on its own.',
            '(Of course, this does not completely eliminate the need for hand-tuning; for example, varying numbers of layers and layer sizes can provide different degrees of abstraction.)[',
            'The word "deep" in "deep learning" refers to the number of layers through which the data is transformed. More precisely, deep learning systems have a substantial credit assignment path (CAP) depth. The CAP is the chain of transformations from input to output.',
            'CAPs describe potentially causal connections between input and output. For a feedforward neural network, the depth of the CAPs is that of the network and is the number of hidden layers plus one (as the output layer is also parameterized).',
            'For recurrent neural networks, in which a signal may propagate through a layer more than once, the CAP depth is potentially unlimited.[2] No universally agreed upon threshold of depth divides shallow learning from deep learning.',
            'CAP of depth 2 has been shown to be a universal approximator in the sense that it can emulate any function.[14] Beyond that, more layers do not add to the function approximator ability of the network.',
            'Deep models (CAP > 2) are able to extract better features than shallow models and hence, extra layers help in learning the features effectively. Deep learning architectures can be constructed with a greedy layer-by-layer method.',
            'Deep learning helps to disentangle these abstractions and pick out which features improve performance.[1]. For supervised learning tasks, deep learning methods eliminate feature engineering, by translating the data into compact intermediate representations',
            'Deep learning algorithms can be applied to unsupervised learning tasks. This is an important benefit because unlabeled data are more abundant than the labeled data. Examples of deep structures that can be trained in an unsupervised manner are neural history compressors and deep belief networks.',
            'Deep neural networks are generally interpreted in terms of the universal approximation theorem or probabilistic inference. The classic universal approximation theorem concerns the capacity of feedforward neural networks with a single hidden layer of finite size to approximate continuous functions.',
            'In 1989, the first proof was published by George Cybenko for sigmoid activation functions and was generalised to feed-forward multi-layer architectures in 1991 by Kurt Hornik.Recent work also showed that universal approximation also holds for non-bounded activation functions such as the rectified linear unit.',
            'he universal approximation theorem for deep neural networks concerns the capacity of networks with bounded width but the depth is allowed to grow. Lu et al. proved that if the width of a deep neural network with ReLU activation is strictly larger than the input dimension, then the network can approximate any Lebesgue integrable function',
            'The probabilistic interpretation[24] derives from the field of machine learning. It features inference, as well as the optimization concepts of training and testing, related to fitting and generalization, respectively',
            'More specifically, the probabilistic interpretation considers the activation nonlinearity as a cumulative distribution function. The probabilistic interpretation led to the introduction of dropout as regularizer in neural networks.',
            'The probabilistic interpretation was introduced by researchers including Hopfield, Widrow and Narendra and popularized in surveys such as the one by Bishop. The term Deep Learning was introduced to the machine learning community by Rina Dechter in 1986',
            'The first general, working learning algorithm for supervised, deep, feedforward, multilayer perceptrons was published by Alexey Ivakhnenko and Lapa in 1965.[32] A 1971 paper described already a deep network with 8 layers trained by the group method of data handling algorithm.',
            'Other deep learning working architectures, specifically those built for computer vision, began with the Neocognitron introduced by Kunihiko Fukushima in 1980.[34] In 1989, Yann LeCun et al. applied the standard backpropagation algorithm',
            'By 1991 such systems were used for recognizing isolated 2-D hand-written digits, while recognizing 3-D objects was done by matching 2-D images with a handcrafted 3-D object model. Weng et al. suggested that a human brain does not use a monolithic 3-D object model and in 1992 they published Cresceptron',
            'Because it directly used natural images, Cresceptron started the beginning of general-purpose visual learning for natural 3D worlds. Cresceptron is a cascade of layers similar to Neocognitron. But while Neocognitron required a human programmer to hand-merge features, Cresceptron learned an open number of features in each layer without supervision',
            'Cresceptron segmented each learned object from a cluttered scene through back-analysis through the network. Max pooling, now often adopted by deep neural networks (e.g. ImageNet tests), was first used in Cresceptron to reduce the position resolution by a factor of (2x2) to 1 through the cascade for better generalization',
            'In 1994, André de Carvalho, together with Mike Fairhurst and David Bisset, published experimental results of a multi-layer boolean neural network, also known as a weightless neural network, composed of a 3-layers self-organising feature extraction neural network module (SOFT) followed by a multi-layer classification neural network module (GSN)',
            'n 1995, Brendan Frey demonstrated that it was possible to train a network containing six fully connected layers and several hundred hidden units using the wake-sleep algorithm, co-developed with Peter Dayan and Hinton. Many factors contribute to the slow speed, including the vanishing gradient problem analyzed in 1991 by Sepp Hochreiter',
            'Simpler models that use task-specific handcrafted features such as Gabor filters and support vector machines (SVMs) were a popular choice in the 1990s and 2000s, because of artificial neural network\'s (ANN) computational cost and a lack of understanding of how the brain wires its biological networks.',
            'Both shallow and deep learning (e.g., recurrent nets) of ANNs have been explored for many years.[47][48][49] These methods never outperformed non-uniform internal-handcrafting Gaussian mixture model/Hidden Markov model (GMM-HMM) technology based on generative models of speech trained discriminatively.',
            'Key difficulties have been analyzed, including gradient diminishing[45] and weak temporal correlation structure in neural predictive models.[51][52] Additional difficulties were the lack of training data and limited computing power. Most speech recognition researchers moved away from neural nets to pursue generative modeling.',
            'An exception was at SRI International in the late 1990s. Funded by the US government\'s NSA and DARPA, SRI studied deep neural networks in speech and speaker recognition. The speaker recognition team led by Larry Heck achieved the first significant success with deep neural networks.',
            'While SRI experienced success with deep neural networks in speaker recognition, they were unsuccessful in demonstrating similar success in speech recognition. The principle of elevating "raw" features over hand-crafted optimization was first explored successfully in the architecture of deep autoencoder on the "raw" spectrogram'
        ]

        # Pre-allocate GPU memory
        tokens_list = [
            self.base_tokenizer.tokenize(sentence)
            for sentence in self.sentence_list
        ]
        features = [
            self.base_tokenizer.convert_tokens_to_ids(tokens)
            for tokens in tokens_list
        ]
        features = [
            self.base_tokenizer.prepare_for_model(input,
                                                  None,
                                                  add_special_tokens=True,
                                                  max_length=128)
            for input in features
        ]
        max_len = max([len(f['input_ids']) for f in features])
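        # right-pad every id sequence with 0s to the longest sequence in the batch (the outer list adds a leading dimension)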
        features = [[
            f['input_ids'] + [0] * (max_len - len(f['input_ids']))
            for f in features
        ]]
        all_input_ids = torch.tensor(features, dtype=torch.long)

        if self.use_gpu:
            all_input_ids = all_input_ids.cuda()

        with torch.no_grad():
            _ = self.model(all_input_ids)[0].cpu().numpy()
 def setup_base_tokenizer(self):
     self.base_tokenizer = GPT2Tokenizer.from_pretrained(
         'distilgpt2', do_lower_case=True, cache_dir=self.test_dir)
Example 19
import argparse
import io
import math
import sys
import torch
from transformers.modeling_gpt2 import GPT2LMHeadModel
from transformers.tokenization_gpt2 import GPT2Tokenizer

MODEL_ID = 'gpt2-xl'
model = GPT2LMHeadModel.from_pretrained(MODEL_ID)
model.eval()
tokenizer = GPT2Tokenizer.from_pretrained(MODEL_ID)


def score(sentence):
    # Based on:
    # https://github.com/huggingface/transformers/issues/1009#issuecomment-521588881
    tokenize_input = tokenizer.tokenize(sentence)
    eos_id = tokenizer.eos_token_id
    token_idxs = tokenizer.convert_tokens_to_ids(tokenize_input)
    token_idxs = torch.tensor([[eos_id] + token_idxs])

    with torch.no_grad():
        outputs = model(token_idxs, labels=token_idxs)
        loss, logits = outputs[:2]
        num_tokens = len(tokenize_input)
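        # `loss` is the mean negative log-likelihood per predicted token, so -loss * num_tokens is the sentence's total log-probability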
        total_logprob = -loss * num_tokens
        return total_logprob.cpu().item(), num_tokens


def main():
Example 20
def main():
    args = InputConfig().args

    logging.basicConfig(format = '%(asctime)s - %(levelname)s - %(name)s -   %(message)s',
                        datefmt = '%m/%d/%Y %H:%M:%S',
                        level = logging.INFO if args.local_rank in [-1, 0] else logging.ERROR)
    logger = logging.getLogger(__file__)
    if args.server_ip and args.server_port and args.local_rank in [-1, 0]:
        # Distant debugging - see https://code.visualstudio.com/docs/python/debugging#_attach-to-a-local-script
        import ptvsd
        print("Waiting for debugger attach")
        ptvsd.enable_attach(address=(args.server_ip, args.server_port), redirect_output=True)
        ptvsd.wait_for_attach()

    trainer_config = get_trainer_config(args)
    # with open('/apdcephfs/share_916081/rainyucao/transformer_chatbot_experiments/test_log', 'w') as f:
    #     a = []
    #     a.append('args local rank is ' + str(args.local_rank) + '\n')
    #     a.append('cuda count' + str(torch.cuda.device_count()) + '\n')
    #     if args.local_rank not in [-1, 0] and torch.cuda.device_count() == 1:
    #         args.local_rank = -1
    #     a.append('args local rank is ' + str(args.local_rank) + '\n')
    #     f.writelines(a)

    # Log only on main process
    if args.local_rank not in [-1, 0]:
        sys.stdout = open("./runs/log_distributed_{}".format(args.local_rank), "w")  # dump stdout
        writer = DummyWriter()
        logging.basicConfig(format='%(asctime)s - %(levelname)s - %(name)s -   %(message)s',
                            datefmt='%m/%d/%Y %H:%M:%S', level=logging.ERROR)
        logger = logging.getLogger(__file__)
    else:
        from datetime import datetime
        current_time = datetime.now().strftime('%b%d_%H-%M-%S')
        if args.single_input:
            comment = '_{}_{}_single'.format(args.model_type, args.data_type)
        else:
            if args.model_type == 'seq2seq':
                comment = '_seq2seq_multi_{}_{}'.format(args.data_type, args.attention_fusion_type)
            else:
                comment = '_{}_{}_{}_{}_{}'.format(args.model_type, args.data_type, args.attention_fusion_type,
                           ('sm' if args.shared_module == 1 else 'nm'), ('sa' if args.shared_attention == 1 else 'na'))
        logdir = os.path.join('runs', current_time + comment)
        writer = SummaryWriter(logdir=logdir)
        logger = config_logger(os.path.join(logdir, 'train.log'))

    log_dir = writer.logdir
    logger.info("Training args: {}".format(args))
    logger.info("trainer config: {}".format(trainer_config))
    interrupt_checkpoint_path = os.path.join(log_dir, trainer_config.interrupt_checkpoint_path)
    last_checkpoint_path = os.path.join(log_dir, trainer_config.last_checkpoint_path)
    best_checkpoint_path = os.path.join(log_dir, 'best_model')
    logger.info("Logging to {}".format(log_dir))  # Let's save everything on an experiment in the ./runs/XXX/directory
    if args.local_rank in [-1, 0]:
        with open(os.path.join(log_dir, "trainer_config.json"), "w") as f:
            json.dump(trainer_config, f)

    set_seed(trainer_config.seed)
    device = torch.device(trainer_config.device)

    parsed_train_data, parsed_valid_data, parsed_test_data = None, None, None
    if args.model_type == 'gpt':
        if args.single_input:
            model = OpenAIGPTLMHeadModel.from_pretrained('./openai-gpt')
        else:
            model = OpenAIGPTEncoderDecoderModel.from_pretrained('./openai-gpt')
        tokenizer = OpenAIGPTTokenizer.from_pretrained('./openai-gpt')
    elif args.model_type == 'dialogpt':
        if args.single_input:
            model = GPT2DoubleHeadsModel.from_pretrained('./dialogpt')
        else:
            model = GPT2EncoderDecoderModel.from_pretrained('./dialogpt')
        tokenizer = GPT2Tokenizer.from_pretrained('./dialogpt')
    elif args.model_type == 'seq2seq':
        seq2seq_vocab = Seq2seqVocab(trainer_config.train_datasets, trainer_config.valid_datasets,
                                 trainer_config.test_datasets, args.vocab_path, data_type=args.data_type)
        tokenizer = seq2seq_vocab.vocab
        parsed_train_data, parsed_valid_data, parsed_test_data = seq2seq_vocab.all_data[0], seq2seq_vocab.all_data[1], \
                                                                 seq2seq_vocab.all_data[2]
        args.dialog_embeddings = False
        model = TransformerSeq2Seq(args.emb_dim, args.hidden_dim, args.num_layers, args.heads, args.depth_size,
                                   args.filter_size, tokenizer, args.pretrained_emb_file, args.pointer_gen, logger,
                                   multi_input=not args.single_input, attention_fusion_type=args.attention_fusion_type)
    else:
        if args.single_input:
            model = GPT2DoubleHeadsModel.from_pretrained('./gpt2-small')
        else:
            model = GPT2EncoderDecoderModel.from_pretrained('./gpt2-small')
        tokenizer = GPT2Tokenizer.from_pretrained('./gpt2-small')


    if args.model_type in ['gpt', 'dialogpt', 'gpt2']:
        tokenizer, additional_length = modify_tokenizer(tokenizer, args.data_type)
        model.embeddings_size = 768
        model.n_embeddings = len(tokenizer)
        model.shared_attention = (args.shared_attention == 1)
        model.shared_module = (args.shared_module == 1)
        model.attention_fusion_type = args.attention_fusion_type
        model.single_input = args.single_input
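        # resize the input embeddings to the extended vocabulary: keep the pretrained rows, zero-init the new rows, and re-tie lm_head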
        if args.model_type == 'gpt':
            model_embedding_weight = model.transformer.tokens_embed.weight
            model.transformer.tokens_embed = nn.Embedding(model.n_embeddings, 768)
            model.lm_head = nn.Linear(768, model.n_embeddings, bias=False)
            model.transformer.tokens_embed.weight.data[:-additional_length, :] = model_embedding_weight.data
            model.transformer.tokens_embed.weight.data[-additional_length:, :] = 0
            model.lm_head.weight = model.transformer.tokens_embed.weight
        else:
            model_embedding_weight = model.transformer.wte.weight
            model.transformer.wte = nn.Embedding(model.n_embeddings, 768)
            model.lm_head = nn.Linear(768, model.n_embeddings, bias=False)
            model.transformer.wte.weight.data[:-additional_length, :] = model_embedding_weight.data
            model.transformer.wte.weight.data[-additional_length:, :] = 0
            model.lm_head.weight = model.transformer.wte.weight

        if not args.single_input:
            model.reload_module_dict()
        model.sent_dialog_id = tokenizer.sent_dialog_id
    model.talker1_id = tokenizer.talker1_bos_id
    model.talker2_id = tokenizer.talker2_bos_id

    model.padding_idx = tokenizer.pad_id
    model.n_pos_embeddings = 512

    model.bos_id = tokenizer.bos_id
    model.eos_id = tokenizer.eos_id
    model.beam_size = args.beam_size
    model.diversity_groups = 1
    model.max_seq_len = 32
    model.dialog_embeddings = args.dialog_embeddings
    model.bs_temperature = args.bs_temperature
    model.bs_nucleus_p = args.bs_nucleus_p
    model.annealing_topk = args.annealing_topk
    model.length_penalty_coef = args.length_penalty
    model.vocab = None
    model.annealing = args.annealing
    model.diversity_coef = args.diversity_coef
    model.sample = False
    model.inference_mode = args.inference_mode
    model.response_k = args.response_k

    logger.info('loading datasets')
    train_dataset = FacebookDataset(trainer_config.train_datasets, tokenizer,
                                    max_lengths=model.n_pos_embeddings - 1,  # A bit restrictive here
                                    dialog_embeddings=args.dialog_embeddings,
                                    cache=trainer_config.train_datasets_cache,
                                    use_start_end=False,
                                    negative_samples=trainer_config.negative_samples,
                                    augment=trainer_config.persona_augment,
                                    aug_syn_proba=trainer_config.persona_aug_syn_proba,
                                    limit_size=trainer_config.limit_train_size,
                                    max_history_size=trainer_config.max_history_size,
                                    single_input=args.single_input,
                                    data_type=args.data_type,
                                    parsed_data=parsed_train_data)
    valid_dataset = FacebookDataset(trainer_config.valid_datasets, tokenizer,
                                    max_lengths=model.n_pos_embeddings - 1,  # A bit restrictive here
                                    dialog_embeddings=args.dialog_embeddings,
                                    cache=trainer_config.valid_datasets_cache,
                                    use_start_end=False,
                                    negative_samples=-1,  # Keep all negative samples
                                    augment=False,
                                    aug_syn_proba=0.0,
                                    limit_size=trainer_config.limit_eval_size,
                                    max_history_size=trainer_config.max_history_size,
                                    single_input=args.single_input,
                                    data_type=args.data_type,
                                    parsed_data=parsed_valid_data)
    test_dataset = FacebookDataset(trainer_config.test_datasets, tokenizer,
                                   max_lengths=model.n_pos_embeddings - 1,  # A bit restrictive here
                                   dialog_embeddings=args.dialog_embeddings,
                                   cache=trainer_config.test_datasets_cache,
                                   use_start_end=False,
                                   negative_samples=-1,  # Keep all negative samples
                                   augment=False,
                                   aug_syn_proba=0.0,
                                   limit_size=trainer_config.limit_eval_size,
                                   max_history_size=trainer_config.max_history_size,
                                   single_input=args.single_input,
                                   data_type=args.data_type,
                                   parsed_data=parsed_test_data)
    logger.info('train dataset {} valid dataset {} test dataset {}'
                .format(len(train_dataset), len(valid_dataset), len(test_dataset)))

    # if args.local_rank != -1:
    #     os.environ['MASTER_ADDR'] = 'localhost'
    #     os.environ['MASTER_PORT'] = '12355'
    #
    #     # initialize the process group
    #     torch.distributed.init_process_group("nccl", rank=args.local_rank, world_size=1)
    #     n = torch.cuda.device_count()
    #     device_ids = list(range(args.local_rank * n, (args.local_rank + 1) * n))
    #     torch.cuda.set_device(args.local_rank)
    #     device = torch.device('cuda', args.local_rank)
    #     transformer.distribute(device_ids[0], device_ids)
    '''Normal training will use normal trainer'''
    model_trainer = Trainer(model,
                            train_dataset,
                            writer,
                            logger=logger,
                            valid_dataset=valid_dataset,
                            test_dataset=test_dataset,
                            train_batch_size=trainer_config.train_batch_size,
                            batch_split=trainer_config.batch_split,
                            test_batch_size=trainer_config.test_batch_size,
                            lr=trainer_config.lr,
                            lr_warmup=trainer_config.lr_warmup,
                            weight_decay=trainer_config.weight_decay,
                            s2s_weight=trainer_config.s2s_weight,
                            lm_weight=trainer_config.lm_weight,
                            risk_weight=trainer_config.risk_weight,
                            hits_weight=trainer_config.hits_weight,
                            single_input=trainer_config.single_input,
                            n_jobs=trainer_config.n_jobs,
                            clip_grad=trainer_config.clip_grad,
                            device=device,
                            ignore_idxs=tokenizer.all_special_ids,
                            local_rank=args.local_rank,
                            apex_level=None,
                            apex_loss_scale=trainer_config.apex_loss_scale,
                            linear_schedule=trainer_config.linear_schedule,
                            n_epochs=trainer_config.n_epochs,
                            evaluate_full_sequences=trainer_config.evaluate_full_sequences,
                            full_input=trainer_config.full_input,
                            uncertainty_loss=args.uncertainty_loss,
                            best_model_path=best_checkpoint_path,
                            extra_module_lr_rate=args.extra_module_lr_rate,
                            no_persona=args.no_persona)

    if args.load_last:
        state_dict = torch.load(trainer_config.load_last, map_location=device)
        model_trainer.load_state_dict(state_dict)

    # helpers -----------------------------------------------------
    def external_metrics_func(full_references, full_predictions, epoch, metric=None, is_best=False):
        if epoch == -1:
            if is_best:
                references_file_path = os.path.join(writer.logdir, 'test_references_file')
                predictions_file_path = os.path.join(writer.logdir, 'test_predictions_file_best')
            else:
                references_file_path = os.path.join(writer.logdir, 'test_references_file')
                predictions_file_path = os.path.join(writer.logdir, 'test_predictions_file_last')
        else:
            references_file_path = os.path.join(writer.logdir, trainer_config.eval_references_file)
            predictions_file_path = os.path.join(writer.logdir, trainer_config.eval_predictions_file + "_{}".format(epoch))
        if not os.path.exists(references_file_path):
            with open(references_file_path, 'w', encoding='utf-8') as f:
                f.write('\n'.join(full_references))
        # print(len(full_predictions))
        with open(os.path.join(writer.logdir, 'tt.json'), 'w') as f:
            json.dump(full_predictions, f)
        with open(predictions_file_path, 'w', encoding='utf-8') as f:
            if len(full_predictions[-1]) == 0:
                full_predictions[-1] = 'a '
            f.write('\n'.join(full_predictions))

        bleu, bleu_list, nist, nist_list, nist_bleu, nist_bleu_list, s_dist, c_dist, entropy, meteor, \
                rouge_l, f1_score, avg_length = nlp_metrics(references_file_path, predictions_file_path, root_path=log_dir)

        metrics = {'meteor': meteor, 'avg_len': avg_length, 'rouge-l': rouge_l, 'bleu': bleu, 'nist': nist,
                   'nist-bleu': nist_bleu, 'f1': f1_score}
        for name, metric in (('bleu', bleu_list), ('nist', nist_list), ('nist_bleu', nist_bleu_list), ('entropy', entropy),
                             ('sentence_div', s_dist), ('corpus_div', c_dist)):
            for i, m in enumerate(metric, 1):
                metrics['{}_{}'.format(name, i)] = m

        return metrics

    def save_func(epoch):
        if epoch != -1:
            torch.save(model_trainer.model.state_dict(), last_checkpoint_path)
            logger.info('Model on Epoch %d has been saved', epoch)

    def sample_text_func(epoch):
        n_samples = 0
        model_trainer.model.eval()
        samples_idxs = random.sample(range(len(valid_dataset)), n_samples)
        samples = [valid_dataset[idx] for idx in samples_idxs]
        for persona_info, dialog, target, _ in samples:
            contexts = [torch.tensor([c], dtype=torch.long, device=model_trainer.device) for c in [persona_info, dialog] if len(c) > 0]
            prediction = model_trainer.model.predict(contexts)[0]

            persona_info_str = tokenizer.ids2string(persona_info[1:-1])
            dialog_str = tokenizer.ids2string(dialog)
            dialog_str = dialog_str.replace(tokenizer.talker1_bos, '\n\t- ').replace(tokenizer.talker2_bos, '\n\t- ')
            dialog_str = dialog_str.replace(tokenizer.talker1_eos, '').replace(tokenizer.talker2_eos, '')
            target_str = tokenizer.ids2string(target[1:-1])
            prediction_str = tokenizer.ids2string(prediction)

            logger.info('\n')
            logger.info('Persona info:\n\t{}'.format(persona_info_str))
            logger.info('Dialog:{}'.format(dialog_str))
            logger.info('Target:\n\t{}'.format(target_str))
            logger.info('Prediction:\n\t{}'.format(prediction_str))

    def test_func(epoch):
        if (epoch+1) % trainer_config.test_period == 0:
            metric_funcs = {'f1_score': f1_score}
            model_trainer.test(metric_funcs, external_metrics_func, epoch)

    def f1_risk(predictions, targets):
        scores = f1_score(predictions, targets, average=False)
        assert all([0 <= s <= 1.0 for s in scores])
        return [1 - s for s in scores]

    def get_risk_metric_func(risk_metric):
        """ risk_metric selected in:
            f1, meteor, avg_len, nist_{1, 2, 3, 4}, entropy_{1, 2, 3, 4}, div_{1, 2}, bleu_{1, 2, 3, 4}
        """
        def external_metric_risk(predictions, targets):
            string_targets = list(tokenizer.ids2string(t) for t in targets)
            string_predictions = list(tokenizer.ids2string(t) for t in predictions)
            metrics = [external_metrics_func([t], [p], epoch=-1, metric=risk_metric) for p, t in zip(string_predictions, string_targets)]

            if any([s in risk_metric for s in ['entropy', 'nist', 'avg_len']]):
                return [-m for m in metrics]

            assert all([0 <= s <= 1.0 for s in metrics]), metrics

            return [1 - m for m in metrics]

        if risk_metric == 'f1':
            return f1_risk

        return external_metric_risk

    # helpers -----------------------------------------------------

    try:
        model_trainer.train(after_epoch_funcs=[save_func, sample_text_func, test_func],
                            risk_func=get_risk_metric_func(trainer_config.risk_metric))
    except (KeyboardInterrupt, Exception, RuntimeError) as e:
        if args.local_rank in [-1, 0]:
            torch.save(model_trainer.state_dict(), interrupt_checkpoint_path)
        raise e
Example 21
def main():
    args = InputConfig().args

    trainer_config = get_trainer_config(args)

    set_seed(trainer_config.seed)
    device = torch.device(trainer_config.device)
    save_path = trainer_config.load_last[:trainer_config.load_last.rfind('/')]
    logger = config_logger(os.path.join(save_path, 'inference.log'))

    parsed_valid_data, parsed_test_data = None, None
    if args.model_type == 'gpt2':
        if args.single_input:
            model = GPT2DoubleHeadsModel.from_pretrained('./gpt2-small')
        else:
            model = GPT2EncoderDecoderModel.from_pretrained('./gpt2-small')
        tokenizer = GPT2Tokenizer.from_pretrained('./gpt2-small')
    elif args.model_type == 'gpt':
        model = OpenAIGPTEncoderDecoderModel.from_pretrained('./openai-gpt')
        tokenizer = OpenAIGPTTokenizer.from_pretrained('./openai-gpt')
    elif args.model_type == 'seq2seq':
        seq2seq_vocab = Seq2seqVocab(trainer_config.train_datasets, trainer_config.valid_datasets,
                                     trainer_config.test_datasets, args.vocab_path, data_type=args.data_type)
        tokenizer = seq2seq_vocab.vocab
        parsed_train_data, parsed_valid_data, parsed_test_data = seq2seq_vocab.all_data[0], seq2seq_vocab.all_data[1], \
                                                                 seq2seq_vocab.all_data[2]
        model = TransformerSeq2Seq(args.emb_dim, args.hidden_dim, args.num_layers, args.heads, args.depth_size,
                                   args.filter_size, tokenizer, args.pretrained_emb_file, args.pointer_gen, logger,
                                   multi_input=not args.single_input, attention_fusion_type=args.attention_fusion_type,
                                   is_eval=True)
        args.dialog_embeddings = False

    model.shared_attention = (args.shared_attention == 1)
    model.shared_module = (args.shared_module == 1)
    model.attention_fusion_type = args.attention_fusion_type
    if args.model_type in ['gpt', 'dialogpt', 'gpt2', 'gpt2_darts']:
        tokenizer, additional_length = modify_tokenizer(tokenizer, args.data_type)
        model.embeddings_size = 768
        model.n_embeddings = len(tokenizer)
        model.shared_attention = (args.shared_attention == 1)
        model.shared_module = (args.shared_module == 1)
        model.attention_fusion_type = args.attention_fusion_type
        model.single_input = args.single_input
        if args.model_type == 'gpt':
            model_embedding_weight = model.transformer.tokens_embed.weight
            model.transformer.tokens_embed = nn.Embedding(model.n_embeddings, 768)
            model.lm_head = nn.Linear(768, model.n_embeddings, bias=False)
            model.transformer.tokens_embed.weight.data[:-additional_length, :] = model_embedding_weight.data
            model.transformer.tokens_embed.weight.data[-additional_length:, :] = 0
            model.lm_head.weight = model.transformer.tokens_embed.weight
        else:
            model_embedding_weight = model.transformer.wte.weight
            model.transformer.wte = nn.Embedding(model.n_embeddings, 768)
            model.lm_head = nn.Linear(768, model.n_embeddings, bias=False)
            model.transformer.wte.weight.data[:-additional_length, :] = model_embedding_weight.data
            model.transformer.wte.weight.data[-additional_length:, :] = 0
            model.lm_head.weight = model.transformer.wte.weight

        if not args.single_input:
            model.reload_module_dict()
        model.sent_dialog_id = tokenizer.sent_dialog_id

    model.padding_idx = tokenizer.pad_id
    model.n_pos_embeddings = 512

    model.talker1_id = tokenizer.talker1_bos_id
    model.talker2_id = tokenizer.talker2_bos_id
    model.bos_id = tokenizer.bos_id
    model.eos_id = tokenizer.eos_id
    model.beam_size = args.beam_size
    model.diversity_groups = 1
    model.max_seq_len = 32
    model.dialog_embeddings = args.dialog_embeddings
    model.bs_temperature = args.bs_temperature
    model.bs_nucleus_p = args.bs_nucleus_p
    model.annealing_topk = args.annealing_topk
    model.length_penalty_coef = args.length_penalty
    model.vocab = None
    model.annealing = args.annealing
    model.diversity_coef = args.diversity_coef
    model.sample = False
    model.inference_mode = args.inference_mode
    model.response_k = args.response_k

    logger.info('loading datasets')
    valid_dataset = FacebookDataset(trainer_config.valid_datasets, tokenizer,
                                   max_lengths=(model.n_pos_embeddings - 1) // (3 if args.single_input else 1),  # A bit restrictive here
                                   dialog_embeddings=args.dialog_embeddings,
                                   cache=trainer_config.valid_datasets_cache,
                                   use_start_end=args.use_start_end,
                                   negative_samples=0,  # Keep all negative samples
                                   augment=False,
                                   aug_syn_proba=0.0,
                                   limit_size=trainer_config.limit_eval_size,
                                   single_input=args.single_input,
                                   data_type=args.data_type,
                                   parsed_data=parsed_valid_data)
    test_dataset = FacebookDataset(trainer_config.test_datasets, tokenizer,
                                   max_lengths=(model.n_pos_embeddings - 1) // (3 if args.single_input else 1),  # A bit restrictive here
                                   dialog_embeddings=args.dialog_embeddings,
                                   cache=trainer_config.test_datasets_cache,
                                   use_start_end=args.use_start_end,
                                   negative_samples=0,  # Keep all negative samples
                                   augment=False,
                                   aug_syn_proba=0.0,
                                   limit_size=trainer_config.limit_eval_size,
                                   single_input=args.single_input,
                                   data_type=args.data_type,
                                   parsed_data=parsed_test_data)
    logger.info(f'valid dataset {len(valid_dataset)} test dataset {(len(test_dataset))}')

    model.to(device)
    logger.info('Weights loaded from {}'.format(trainer_config.load_last))

    trainer = Trainer(model,
                      valid_dataset,
                      None,
                      logger=logger,
                      valid_dataset=valid_dataset,
                      test_dataset=test_dataset,
                      train_batch_size=trainer_config.train_batch_size,
                      batch_split=trainer_config.batch_split,
                      test_batch_size=trainer_config.test_batch_size,
                      single_input=args.single_input,
                      n_jobs=trainer_config.n_jobs,
                      clip_grad=trainer_config.clip_grad,
                      device=device,
                      ignore_idxs=tokenizer.all_special_ids,
                      local_rank=args.local_rank,
                      apex_level=None,
                      apex_loss_scale=trainer_config.apex_loss_scale,
                      linear_schedule=trainer_config.linear_schedule,
                      n_epochs=trainer_config.n_epochs,
                      evaluate_full_sequences=trainer_config.evaluate_full_sequences,
                      full_input=trainer_config.full_input,
                      uncertainty_loss=args.uncertainty_loss)

    def external_metrics_func(full_references, full_predictions, epoch, metric=None):
        if epoch == -1:
            references_file_path = os.path.join(save_path, 'test_references_file.txt')
            predictions_file_path = os.path.join(save_path, 'test_predictions_file.txt')
        else:
            references_file_path = os.path.join(save_path, 'eval_references_file.txt')
            predictions_file_path = os.path.join(save_path, 'eval_predictions_file.txt')
        with open(references_file_path, 'w', encoding='utf-8') as f:
            f.write('\n'.join(full_references))
        with open(predictions_file_path, 'w', encoding='utf-8') as f:
            f.write('\n'.join(full_predictions))

        bleu, bleu_list, nist, nist_list, nist_bleu, nist_bleu_list, s_dist, c_dist, entropy, meteor, \
        rouge_l, f1_score, avg_length = nlp_metrics(references_file_path, predictions_file_path)

        metrics = {'meteor': meteor, 'avg_len': avg_length, 'rouge-l': rouge_l, 'bleu': bleu, 'nist': nist,
                   'nist-bleu': nist_bleu, 'f1': f1_score}
        for name, metric in (
        ('bleu', bleu_list), ('nist', nist_list), ('nist_bleu', nist_bleu_list), ('entropy', entropy),
        ('sentence_div', s_dist), ('corpus_div', c_dist)):
            for i, m in enumerate(metric, 1):
                metrics['{}_{}'.format(name, i)] = m

        return metrics

    metric_funcs = {'f1_score': f1_score}
    # trainer.test(metric_funcs, external_metrics_func, epoch=0, inference=True)
    trainer.test(metric_funcs, external_metrics_func, epoch=-1, inference=True)
Example 22
#!/usr/bin/env python3

from transformers.tokenization_gpt2 import GPT2Tokenizer
from transformers.modeling_gpt2 import GPT2LMHeadModel
import torch

tokenizer = GPT2Tokenizer.from_pretrained('gpt2', pad_token='<|endoftext|>')
model = GPT2LMHeadModel.from_pretrained('gpt2')

# Complete phrases are: "I like to drink soda without sugar" and "Go watch TV alone, I am not going"
docs = ["I like to drink soda", "Go watch TV"]
docs_tensors = tokenizer.batch_encode_plus([d for d in docs],
                                           pad_to_max_length=True,
                                           return_tensors='pt')

docs_next = ["without sugar", "alone, I am not going"]
docs_next_tensors = tokenizer.batch_encode_plus([d for d in docs_next],
                                                pad_to_max_length=True,
                                                return_tensors='pt')

# predicting the first part of each phrase
_, past = model(docs_tensors['input_ids'],
                attention_mask=docs_tensors['attention_mask'])

# predicting the rest of the phrase
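# the attention mask must cover the cached (past) tokens as well as the new input ids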
attn_mask = torch.cat(
    [docs_tensors['attention_mask'], docs_next_tensors['attention_mask']],
    dim=-1)
logits, _ = model(docs_next_tensors['input_ids'],
                  attention_mask=attn_mask,
                  past=past)
Example 23
	def _load_PretrainedVocab():
		vocab_file = './tests/dataloader/dummy_gpt2vocab/vocab.json'
		merges_file = './tests/dataloader/dummy_gpt2vocab/merges.txt'
		from transformers.tokenization_gpt2 import GPT2Tokenizer
		return PretrainedVocab(GPT2Tokenizer(vocab_file, merges_file, unk_token='<|endoftext|>'))