Exemple #1
0
    def __init__(self, args, dictionary, left_pad=False):
        """Wrap a pretrained RoBERTa or BERT model as the embedding backbone.

        The model family is picked from ``args.pretrained_bert_model``:
        names starting with 'roberta' load RoBERTa classes, anything else
        loads BERT classes.
        """
        super().__init__(dictionary)
        self.dropout = args.dropout

        from pytorch_transformers import RobertaModel, BertModel
        from pytorch_transformers.file_utils import PYTORCH_TRANSFORMERS_CACHE
        from pytorch_transformers import RobertaConfig, RobertaTokenizer, BertConfig, BertTokenizer

        # Per-rank cache directory keeps distributed workers from clobbering
        # each other's downloads.
        cache_dir = PYTORCH_TRANSFORMERS_CACHE / 'distributed_{}'.format(
            args.distributed_rank)

        if args.pretrained_bert_model.startswith('roberta'):
            model_cls, config_cls = RobertaModel, RobertaConfig
            self.tokenizer = RobertaTokenizer.from_pretrained('roberta-base')
        else:
            model_cls, config_cls = BertModel, BertConfig
            self.tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

        self.embed = model_cls.from_pretrained(
            args.pretrained_bert_model, cache_dir=cache_dir)
        self.config = config_cls.from_pretrained(args.pretrained_bert_model)

        self.padding_idx = self.tokenizer.convert_tokens_to_ids(
            self.tokenizer.pad_token)
Exemple #2
0
    def _roberta(self, text, unit="text"):
        """Tokenize *text* with a lazily-constructed RoBERTa BPE tokenizer.

        ex)
        """
        if self.bpe_tokenizer is None:
            # Resolve the vocab/merges entries to local file paths, then
            # remove them so the remaining self.config entries can be
            # forwarded as RobertaTokenizer keyword arguments.
            vocab_path = self.data_handler.read(
                self.config["vocab_path"], return_path=True)
            merges_path = self.data_handler.read(
                self.config["merges_path"], return_path=True)
            self.config.pop("vocab_path")
            self.config.pop("merges_path")

            self.bpe_tokenizer = RobertaTokenizer(
                vocab_path, merges_path, **self.config)

        return self.bpe_tokenizer._tokenize(text)
def get_tokenizer(tokenizer_name):
    """Instantiate the tokenizer matching *tokenizer_name*.

    Returns None when the name matches no known tokenizer family.
    """
    log.info(f"\tLoading Tokenizer {tokenizer_name}")
    tokenizer = None
    if tokenizer_name.startswith("bert-"):
        tokenizer = BertTokenizer.from_pretrained(
            tokenizer_name, do_lower_case=tokenizer_name.endswith("uncased"))
    elif tokenizer_name.startswith("roberta-"):
        tokenizer = RobertaTokenizer.from_pretrained(tokenizer_name)
    elif tokenizer_name.startswith("xlnet-"):
        tokenizer = XLNetTokenizer.from_pretrained(
            tokenizer_name, do_lower_case=tokenizer_name.endswith("uncased"))
    elif tokenizer_name.startswith("openai-gpt"):
        tokenizer = OpenAIGPTTokenizer.from_pretrained(tokenizer_name)
    elif tokenizer_name.startswith("gpt2"):
        tokenizer = GPT2Tokenizer.from_pretrained(tokenizer_name)
    elif tokenizer_name.startswith("transfo-xl-"):
        # TransformerXL is trained on data pretokenized with MosesTokenizer
        tokenizer = MosesTokenizer()
    elif tokenizer_name.startswith("xlm-"):
        tokenizer = XLMTokenizer.from_pretrained(tokenizer_name)
    elif tokenizer_name == "MosesTokenizer":
        tokenizer = MosesTokenizer()
    elif tokenizer_name == "SplitChars":
        tokenizer = SplitCharsTokenizer()
    elif tokenizer_name == "":
        tokenizer = SpaceTokenizer()
    return tokenizer
Exemple #4
0
 def __init__(self, pretrain_path, max_length):
     """Binary sequence classifier built on a pretrained RoBERTa encoder."""
     nn.Module.__init__(self)
     self.max_length = max_length
     self.modelName = 'Roberta'
     # Two-way classification head on top of the pretrained weights.
     self.bert = RobertaForSequenceClassification.from_pretrained(
         pretrain_path, num_labels=2)
     self.tokenizer = RobertaTokenizer.from_pretrained('roberta-base')
Exemple #5
0
class BPETokenizer(Tokenizer):
    """
    BPE (Byte-Pair Encoding) Tokenizer
    text -> ...
    * Args:
        name: tokenizer name [roberta]
        config: keyword arguments for the underlying tokenizer; must contain
            'vocab_path' and 'merges_path' entries (consumed lazily).
    """
    def __init__(self, name, config=None):
        super(BPETokenizer, self).__init__(name, f"bpe-{name}")
        self.data_handler = DataHandler(CachePath.VOCAB)
        # BUG FIX: the default was the mutable `config={}`. _roberta() mutates
        # self.config (deletes the path keys), so every instance created
        # without an explicit config shared — and corrupted — the same dict.
        self.config = {} if config is None else config

        # Built lazily on the first tokenize call.
        self.bpe_tokenizer = None

    """ Tokenizers """

    def _roberta(self, text, unit="text"):
        """
        Tokenize *text* into RoBERTa BPE sub-tokens.

        ex)
        """
        if self.bpe_tokenizer is None:
            # Resolve vocab/merges to local paths, then drop them so the
            # remaining config keys can be passed through as kwargs.
            vocab_path = self.data_handler.read(self.config["vocab_path"],
                                                return_path=True)
            merges_path = self.data_handler.read(self.config["merges_path"],
                                                 return_path=True)
            del self.config["vocab_path"]
            del self.config["merges_path"]

            self.bpe_tokenizer = RobertaTokenizer(vocab_path, merges_path,
                                                  **self.config)

        return self.bpe_tokenizer._tokenize(text)
Exemple #6
0
        def check_iterator(pretrained_model, file, max_seq_length=None):
            """Read *file* through BertNLIDatasetReader + BasicIterator and
            verify that every batch's input_ids / token_type_ids /
            attention_mask follow the model family's special-token layout."""
            batch_size = 10
            # Model family is the prefix of the pretrained name, e.g. 'bert'.
            name = pretrained_model.split('-')[0].lower()
            tokenizer = RobertaTokenizer.from_pretrained(
                pretrained_model
            ) if name == 'roberta' else BertTokenizer.from_pretrained(
                pretrained_model)
            # percent_data=0.001 keeps the check fast: 0.1% of the file.
            reader = BertNLIDatasetReader(pretrained_model,
                                          lazy=True,
                                          percent_data=0.001,
                                          max_seq_length=max_seq_length)
            iterator = BasicIterator(batch_size=batch_size,
                                     max_instances_in_memory=10000)

            for batch_dict in iterator(reader.read(file), num_epochs=1):
                # All three tensors must be padded to identical shapes.
                assert batch_dict['input_ids'].size(
                ) == batch_dict['token_type_ids'].size(
                ) == batch_dict['attention_mask'].size()

                for idx in range(batch_dict['input_ids'].size(0)):
                    input_ids = batch_dict['input_ids'][idx].numpy().tolist()
                    token_type_ids = batch_dict['token_type_ids'][idx]
                    attention_mask = batch_dict['attention_mask'][idx]
                    premise = batch_dict['metadata'][idx]['premise_tokens']
                    hypothesis = batch_dict['metadata'][idx][
                        'hypothesis_tokens']

                    # BERT wraps pairs as [CLS] a [SEP] b [SEP] (3 specials);
                    # RoBERTa as <s> a </s></s> b </s> (4 specials).
                    num_extra_tokens = 3 if name == 'bert' else 4
                    num_input_ids = len(premise) + len(
                        hypothesis) + num_extra_tokens

                    # Check input ids
                    if name == 'bert':
                        assert input_ids[:
                                         num_input_ids] == tokenizer.convert_tokens_to_ids(
                                             ['[CLS]'] + premise + ['[SEP]'] +
                                             hypothesis + ['[SEP]'])

                        # First segment = [CLS] + premise + [SEP]; 102 is
                        # presumably BERT's [SEP] id — TODO confirm vocab.
                        segment_divide = len(premise) + 2
                        assert input_ids[:segment_divide][-1] == 102
                        assert torch.sum(token_type_ids[:segment_divide]) == 0
                        assert torch.sum(
                            token_type_ids[segment_divide:num_input_ids]
                        ) == num_input_ids - segment_divide
                        # Padding positions must keep segment id 0.
                        assert torch.sum(token_type_ids[num_input_ids:]) == 0
                    else:
                        assert input_ids[:
                                         num_input_ids] == tokenizer.convert_tokens_to_ids(
                                             ['<s>'] + premise + ['</s>'] * 2 +
                                             hypothesis + ['</s>'])

                    # Check attention mask: ones over real tokens, zeros over
                    # padding.
                    assert torch.sum(
                        attention_mask[:num_input_ids]).item() == num_input_ids
                    assert torch.sum(
                        attention_mask[num_input_ids:]).item() == 0
    def __init__(self, model_name_or_path: str, max_seq_length: int = 128, do_lower_case: bool = True):
        """Load a pretrained RoBERTa encoder together with its tokenizer."""
        super(RoBERTa, self).__init__()
        # Attribute names serialized as this module's configuration.
        self.config_keys = ['max_seq_length', 'do_lower_case']
        self.max_seq_length = max_seq_length
        self.do_lower_case = do_lower_case

        self.roberta = RobertaModel.from_pretrained(model_name_or_path)
        self.tokenizer = RobertaTokenizer.from_pretrained(
            model_name_or_path, do_lower_case=do_lower_case)
        # Cache the special-token ids used when assembling model inputs.
        to_ids = self.tokenizer.convert_tokens_to_ids
        self.cls_token_id = to_ids([self.tokenizer.cls_token])[0]
        self.sep_token_id = to_ids([self.tokenizer.sep_token])[0]
Exemple #8
0
def add_pytorch_transformers_vocab(vocab, tokenizer_name):
    """Add vocabulary from tokenizers in pytorch_transformers for use with pre-tokenized data.

    These tokenizers have a convert_tokens_to_ids method, but this doesn't do
    anything special, so we can just use the standard indexers.

    Raises:
        ValueError: if tokenizer_name matches no supported model family
            (previously this crashed later with UnboundLocalError).
    """
    do_lower_case = "uncased" in tokenizer_name

    if tokenizer_name.startswith("bert-"):
        tokenizer = BertTokenizer.from_pretrained(tokenizer_name,
                                                  do_lower_case=do_lower_case)
    elif tokenizer_name.startswith("roberta-"):
        tokenizer = RobertaTokenizer.from_pretrained(tokenizer_name)
    elif tokenizer_name.startswith("xlnet-"):
        tokenizer = XLNetTokenizer.from_pretrained(tokenizer_name,
                                                   do_lower_case=do_lower_case)
    elif tokenizer_name.startswith("openai-gpt"):
        tokenizer = OpenAIGPTTokenizer.from_pretrained(tokenizer_name)
    elif tokenizer_name.startswith("gpt2"):
        tokenizer = GPT2Tokenizer.from_pretrained(tokenizer_name)
    elif tokenizer_name.startswith("transfo-xl-"):
        tokenizer = TransfoXLTokenizer.from_pretrained(tokenizer_name)
    elif tokenizer_name.startswith("xlm-"):
        tokenizer = XLMTokenizer.from_pretrained(tokenizer_name)
    else:
        raise ValueError(f"Unsupported tokenizer name: {tokenizer_name!r}")

    # GPT-style models ship without these special tokens; register them so
    # downstream code can rely on bos/sep/cls existing.
    if (tokenizer_name.startswith("openai-gpt")
            or tokenizer_name.startswith("gpt2")
            # BUG FIX: was "transo-xl-", which never matched, so TransfoXL
            # tokenizers silently missed their special tokens.
            or tokenizer_name.startswith("transfo-xl-")):
        tokenizer.add_special_tokens({
            "bos_token": "<start>",
            "sep_token": "<delim>",
            "cls_token": "<extract>"
        })
    # TODO: this is another place can be simplified by "model-before-preprocess" reorganization
    # we can pass tokenizer created in model here, see issue <TBD>

    vocab_size = len(tokenizer)
    # do not use tokenizer.vocab_size, it does not include newly added token
    if tokenizer_name.startswith("roberta-"):
        if tokenizer.convert_ids_to_tokens(vocab_size - 1) is None:
            vocab_size -= 1
        else:
            log.info("Time to delete vocab_size-1 in preprocess.py !!!")
    # due to a quirk in huggingface's file, the last token of RobertaTokenizer is None, remove
    # this when they fix the problem

    ordered_vocab = tokenizer.convert_ids_to_tokens(range(vocab_size))
    log.info("Added pytorch_transformers vocab (%s): %d tokens",
             tokenizer_name, len(ordered_vocab))
    for word in ordered_vocab:
        vocab.add_token_to_namespace(
            word, input_module_tokenizer_name(tokenizer_name))
Exemple #9
0
 def __init__(self, model):
     """Wrap a pretrained encoder, choosing RoBERTa or BERT from the name."""
     super().__init__()
     if 'roberta' in model:
         print("Roberta model: {}".format(model))
         self.tokenizer = RobertaTokenizer.from_pretrained(model)
         self.bert = RobertaModel.from_pretrained(model)
     else:
         print("Bert model: {}".format(model))
         self.tokenizer = BertTokenizer.from_pretrained(model)
         self.bert = BertModel.from_pretrained(model)
     # Hidden width and position count are read off the loaded weights rather
     # than hard-coded, so any checkpoint size works.
     self.dim = self.bert.pooler.dense.in_features
     self.max_len = self.bert.embeddings.position_embeddings.num_embeddings

     if use_cuda:
         self.cuda()
Exemple #10
0
    def __init__(self, opt):
        """Build the tokenizer, pretrained backbone and model selected by
        *opt*, then load the train/val/test ABSA datasets."""
        self.opt = opt
        if 'roberta' in opt.pretrained_bert_name:
            tokenizer = RobertaTokenizer.from_pretrained(
                opt.pretrained_bert_name)
            transformer = RobertaModel.from_pretrained(
                opt.pretrained_bert_name, output_attentions=True)
        elif 'bert' in opt.pretrained_bert_name:
            tokenizer = BertTokenizer.from_pretrained(opt.pretrained_bert_name)
            transformer = BertModel.from_pretrained(opt.pretrained_bert_name,
                                                    output_attentions=True)
        elif 'xlnet' in opt.pretrained_bert_name:
            tokenizer = XLNetTokenizer.from_pretrained(
                opt.pretrained_bert_name)
            transformer = XLNetModel.from_pretrained(opt.pretrained_bert_name,
                                                     output_attentions=True)
        # BUG FIX: the original condition was `'bert' or 'xlnet' in
        # opt.model_name`, which is always truthy ('bert' is a non-empty
        # string), so the word-embedding fallback below was unreachable.
        if 'bert' in opt.model_name or 'xlnet' in opt.model_name:
            tokenizer = Tokenizer4Pretrain(tokenizer, opt.max_seq_len)
            self.model = opt.model_class(transformer, opt).to(opt.device)
        else:
            # Non-pretrained path: build a word-level tokenizer and a static
            # embedding matrix from the dataset files.
            tokenizer = build_tokenizer(
                fnames=[opt.dataset_file['train'], opt.dataset_file['test']],
                max_seq_len=opt.max_seq_len,
                dat_fname='{0}_tokenizer.dat'.format(opt.dataset))
            embedding_matrix = build_embedding_matrix(
                word2idx=tokenizer.word2idx,
                embed_dim=opt.embed_dim,
                dat_fname='{0}_{1}_embedding_matrix.dat'.format(
                    str(opt.embed_dim), opt.dataset))
            self.model = opt.model_class(embedding_matrix, opt).to(opt.device)

        self.trainset = ABSADataset(opt.dataset_file['train'], tokenizer)
        self.testset = ABSADataset(opt.dataset_file['test'], tokenizer)
        # Carve a validation split off the training set when requested;
        # otherwise validate on the test set.
        assert 0 <= opt.valset_ratio < 1
        if opt.valset_ratio > 0:
            valset_len = int(len(self.trainset) * opt.valset_ratio)
            self.trainset, self.valset = random_split(
                self.trainset, (len(self.trainset) - valset_len, valset_len))
        else:
            self.valset = self.testset

        if opt.device.type == 'cuda':
            logger.info('cuda memory allocated: {}'.format(
                torch.cuda.memory_allocated(device=opt.device.index)))
        self._print_args()
Exemple #11
0
 def __init__(self,
              train_file,
              valid_file,
              test_file,
              dict_path,
              bert_path,
              bert_name,
              batch_size=4,
              seq_length=20,
              negative_sample_rate=1.0,
              negative_sample_size=25):
     """Load train/valid/test corpora and an entity dictionary, and set up
     a RoBERTa or BERT tokenizer for batching.

     Args:
         train_file / valid_file / test_file: corpus files read via
             self.read_file; each corpus is shuffled in place.
         dict_path: file of tab-separated "entity<TAB>entity_id" lines.
         bert_path: directory prefix the tokenizer name is appended to.
         bert_name: 'roberta-base' or 'bert-base-uncased'.
     """
     self.batch_size = batch_size
     self.negative_sampling_rate = negative_sample_rate
     self.negative_sampling_size = negative_sample_size
     self.seq_length = seq_length
     self.ent_dict = {}
     # NOTE(review): if bert_name is neither value below, self.tokenizer is
     # never assigned — confirm callers only ever pass these two names.
     if bert_name == 'roberta-base':
         self.tokenizer = RobertaTokenizer.from_pretrained(bert_path +
                                                           bert_name)
     elif bert_name == 'bert-base-uncased':
         self.tokenizer = BertTokenizer.from_pretrained(bert_path +
                                                        bert_name)
     logger.info('reading entity dict...')
     with open(dict_path) as f:
         for line in f:
             entity, entity_id = line.strip('\n').split('\t')
             self.ent_dict[entity] = int(entity_id)
     logger.info('there are {} entities'.format(len(self.ent_dict)))
     # NOTE(review): training corpus hard-capped at 96106 instances — the
     # constant presumably matches a specific dataset; confirm before reuse.
     self.train_corpus = self.read_file(train_file)[:96106]
     self.total_train_instances = 96106
     random.shuffle(self.train_corpus)
     logger.info('there are {} instances in {}'.format(
         self.total_train_instances, train_file))
     self.valid_corpus = self.read_file(valid_file)
     self.total_valid_instances = len(self.valid_corpus)
     random.shuffle(self.valid_corpus)
     logger.info('there are {} instances in {}'.format(
         self.total_valid_instances, valid_file))
     self.test_corpus = self.read_file(test_file)
     self.total_test_instances = len(self.test_corpus)
     random.shuffle(self.test_corpus)
     logger.info('there are {} instances in {}'.format(
         self.total_test_instances, test_file))
Exemple #12
0
        def check_tokenizer_dataset(pretrained_model, file):
            """Read *file* with BertNLIDatasetReader and verify each
            instance's ids, segment ids, attention mask and label against
            the model family's special-token layout."""
            name = pretrained_model.split('-')[0].lower()
            assert name in ['bert', 'roberta']
            tokenizer = RobertaTokenizer.from_pretrained(
                pretrained_model
            ) if name == 'roberta' else BertTokenizer.from_pretrained(
                pretrained_model)
            reader = BertNLIDatasetReader(pretrained_model,
                                          lazy=True,
                                          percent_data=0.001)

            for instance in reader.read(file):
                input_ids = instance['input_ids'].array.tolist()
                token_type_ids = instance['token_type_ids'].array
                attention_mask = instance['attention_mask'].array
                premise_tokens = instance['metadata'].metadata[
                    'premise_tokens']
                hypothesis_tokens = instance['metadata'].metadata[
                    'hypothesis_tokens']

                assert len(input_ids) == len(token_type_ids) == len(
                    attention_mask)
                # BUG FIX: `attention_mask.all() == 1` was a truthiness
                # comparison, not an element-wise check.
                assert (attention_mask == 1).all()
                assert reader._label_dict[instance['metadata'].metadata[
                    'label']] == instance['label'].array

                if name == 'bert':
                    # BUG FIX: this comparison was missing its `assert`, so
                    # the BERT input-id check was silently discarded.
                    assert tokenizer.convert_tokens_to_ids(
                        ['[CLS]'] + premise_tokens + ['[SEP]'] +
                        hypothesis_tokens + ['[SEP]']) == input_ids
                    segment_divide = len(premise_tokens) + 2
                    # 102 is presumably BERT's [SEP] id — TODO confirm vocab.
                    assert input_ids[:segment_divide][-1] == 102
                    # BUG FIX: `x.all() == 0` / `x.all() == 1` only compared
                    # the truthiness of .all(); assert exact segment ids.
                    assert (token_type_ids[:segment_divide] == 0).all()
                    assert (token_type_ids[segment_divide:] == 1).all()
                elif name == 'roberta':
                    assert tokenizer.convert_tokens_to_ids(
                        ['<s>'] + premise_tokens + ['</s>', '</s>'] +
                        hypothesis_tokens + ['</s>']) == input_ids
Exemple #13
0
    def __init__(self,
                 model_name_or_path: str,
                 max_seq_length: int = 128,
                 do_lower_case: bool = True):
        """Load a pretrained RoBERTa encoder, capping max_seq_length at 510."""
        super(RoBERTa, self).__init__()
        self.config_keys = ['max_seq_length', 'do_lower_case']
        self.do_lower_case = do_lower_case

        # 512 positions minus the two special tokens (<s> ... </s>).
        if max_seq_length > 510:
            logging.warning(
                "RoBERTa only allows a max_seq_length of 510 (512 with special tokens). Value will be set to 510"
            )
            max_seq_length = 510
        self.max_seq_length = max_seq_length

        self.roberta = RobertaModel.from_pretrained(model_name_or_path)
        self.tokenizer = RobertaTokenizer.from_pretrained(
            model_name_or_path, do_lower_case=do_lower_case)
        # Cache the special-token ids used to assemble model inputs.
        lookup = self.tokenizer.convert_tokens_to_ids
        self.cls_token_id = lookup([self.tokenizer.cls_token])[0]
        self.sep_token_id = lookup([self.tokenizer.sep_token])[0]
Exemple #14
0
    def __init__(self,
                 pretrained_model,
                 percent_data=1,
                 max_seq_length=None,
                 lazy=False) -> None:
        """Dataset reader for NLI data tokenized for BERT or RoBERTa.

        Args:
            pretrained_model: pretrained model name; its prefix
                ('bert'/'roberta') selects the tokenizer class.
            percent_data: fraction of the data to read, in (0, 1].
            max_seq_length: optional cap on the token sequence length.
            lazy: forwarded to the base reader.

        Raises:
            ValueError: if percent_data is out of range or the model name
                is neither a bert nor a roberta variant.
        """
        super().__init__(lazy)
        # Validate with a real exception: `assert` is stripped under -O.
        if not 0 < percent_data <= 1:
            raise ValueError('percent_data must be in (0, 1]')
        self.percent_data = percent_data
        self.max_seq_length = max_seq_length
        self.tokenizer_class = pretrained_model.split('-')[0].lower()

        if self.tokenizer_class == 'roberta':
            self._tokenizer = RobertaTokenizer.from_pretrained(
                pretrained_model)
        elif self.tokenizer_class == 'bert':
            self._tokenizer = BertTokenizer.from_pretrained(pretrained_model,
                                                            do_lower_case=True)
        else:
            raise ValueError('tokenizer_model must either be roberta or bert')

        # First id of the encoded separator token, used when splicing pairs.
        self.sep_id = self._tokenizer.encode(self._tokenizer.sep_token)[0]
        self._label_dict = {'entailment': 0, 'neutral': 1, 'contradiction': 2}
Exemple #15
0
##########################
# Utility functions      #
##########################

import torch
import time, sys
from pytorch_transformers import RobertaTokenizer, BertTokenizer

# Module-level tokenizers shared by the helpers below; loading once at import
# time avoids re-reading the vocab files on every call.
tokenizer = RobertaTokenizer.from_pretrained('roberta-base')
tokenizer_bert = BertTokenizer.from_pretrained('bert-base-cased')


def _truncate_seq_pair(tokens_a, max_length):
    """Truncates a sequence pair in place to the maximum length.
    Copyed from https://github.com/huggingface/pytorch-pretrained-BERT
    """
    # This is a simple heuristic which will always truncate the longer sequence
    # one token at a time. This makes more sense than truncating an equal percent
    # of tokens from each, since if one sequence is very short then each token
    # that's truncated likely contains more information than a longer sequence.
    while True:
        total_length = len(tokens_a)
        if total_length <= max_length:
            break
        tokens_a.pop()


def get_BERT_vector(sent1,
                    sent2=None,
                    max_sent1_len=400,
                    max_sent2_len=100,
Exemple #16
0
    def __init__(self, args):
        """Connector exposing a pretrained RoBERTa masked LM: loads the
        tokenizer and weights, and derives a printable vocabulary plus the
        special-token indices used by the rest of this class."""
        super().__init__()

        if args.hfroberta_model_dir is not None:
            # load bert model from file
            roberta_model_name = str(args.hfroberta_model_dir) + "/"
            dict_file = roberta_model_name
            print("loading huggingface RoBERTa model from {}".format(roberta_model_name))
        else:
            # load RoBERTa model from huggingface cache
            roberta_model_name = args.hfroberta_model_name
            dict_file = roberta_model_name

        # When using a cased model, make sure to pass do_lower_case=False directly to BaseTokenizer
        # NOTE(review): do_lower_case is computed here but never used below —
        # confirm whether it should be forwarded to the tokenizer.
        do_lower_case = False
        if 'uncased' in roberta_model_name:
            do_lower_case=True

        # Load pre-trained model tokenizer (vocabulary)
        self.tokenizer = RobertaTokenizer.from_pretrained(dict_file)

        # original vocab

        # The following process is based on gpt_connector.

        # RoBERTa also uses BPE. the bytes_to_unicode function takes all control
        # and whitespace characters in code points 0-255 and shifts them up
        # by 256 to make them printable. So space (code point 32) becomes Ġ (code point 288).
        # (copied from https://github.com/openai/gpt-2/issues/80#issuecomment-487202159).
        #
        # Other control characters will be removed during voca_intersection process.
        def convert_word(word):
            # Special tokens pass through unchanged; a leading 'Ġ' (encoded
            # space) is stripped; any other piece is wrapped as _piece_.
            if word == ROBERTA_UNK:
                return word
            if word == ROBERTA_MASK:
                return word
            if word == ROBERTA_START_SENTENCE:
                return word
            if word == ROBERTA_END_SENTENCE:
                return word
            if word == ROBERTA_PAD:
                return word

            if word.startswith('Ġ'):  # the token starts with a whitespace
                return word[1:]

            return f'_{word}_'  # the token not start with a white space.
                                # may be not a head of a word,
                                # or may be a head of a sentence.

            # need duplication check?

        # tokenizer.decoder maps id -> token; sorting its items by id yields
        # the vocabulary in index order.
        _, gpt_vocab = zip(*sorted(self.tokenizer.decoder.items()))
        self.vocab = [convert_word(word) for word in gpt_vocab]
        self._init_inverse_vocab()

        # Get UNK symbol as it's written in the origin RoBERTa vocab.
        unk_index = self.inverse_vocab[ROBERTA_UNK]  # OPENAI_UNK
        self.unk_symbol = self.tokenizer.decoder[unk_index]

        # Get MASK symbol as it's written in the origin RoBERTa vocab.
        mask_index = self.inverse_vocab[ROBERTA_MASK]
        self.mask_symbol = self.tokenizer.decoder[mask_index]

        # Load pre-trained model (weights)
        self.masked_roberta_model = RobertaForMaskedLM.from_pretrained(roberta_model_name)
        self.masked_roberta_model.eval()
        #print(self.masked_roberta_model.config)

        # ... to get hidden states
        self.roberta_model = self.masked_roberta_model.roberta

        # Sanity check.
        #assert len(self.vocab) == self.masked_roberta_model.config.vocab_size
        #assert 0 == self.masked_roberta_model.config.n_special

        self.eos_id = self.inverse_vocab[ROBERTA_END_SENTENCE]  # OPENAI_EOS
        self.model_vocab = self.vocab

        self.pad_id = self.inverse_vocab[ROBERTA_PAD]
        self.unk_index = self.inverse_vocab[ROBERTA_UNK]
        self.mask_index = mask_index
Exemple #17
0
 def __init__(self, max_seq_len, pretrained_roberta_name):
     """Bind a pretrained RoBERTa tokenizer to a fixed sequence length."""
     self.max_seq_len = max_seq_len
     self.tokenizer = RobertaTokenizer.from_pretrained(
         pretrained_roberta_name)
Exemple #18
0
def main():
    """End-to-end domain-adaptation pipeline: parse arguments, tokenize the
    source/target sentiment datasets, build the encoder and classifiers,
    then train and evaluate on the target domain."""
    args = parse_arguments()
    # argument setting
    print("=== Argument Setting ===")
    print("src: " + args.src)
    print("tgt: " + args.tgt)
    print("alpha: " + str(args.alpha))
    print("seed: " + str(args.seed))
    print("train_seed: " + str(args.train_seed))
    print("model_type: " + str(args.model))
    print("max_seq_length: " + str(args.max_seq_length))
    print("batch_size: " + str(args.batch_size))
    print("num_epochs: " + str(args.num_epochs))
    set_seed(args.train_seed)

    if args.model == 'roberta':
        tokenizer = RobertaTokenizer.from_pretrained('roberta-base')
    else:
        tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

    # preprocess data
    print("=== Processing datasets ===")
    # Source domain: blog/airline ship as CSV, other domains as paired
    # negative/positive XML review files.
    if args.src == 'blog':
        src_x, src_y = CSV2Array(os.path.join('data', args.src, 'blog.csv'))

    elif args.src == 'airline':
        src_x, src_y = CSV2Array(os.path.join('data', args.src, 'airline.csv'))

    else:
        src_x, src_y = XML2Array(
            os.path.join('data', args.src, 'negative.review'),
            os.path.join('data', args.src, 'positive.review'))

    # Hold out 20% of the source data for evaluation, stratified by label.
    src_x, src_test_x, src_y, src_test_y = train_test_split(
        src_x, src_y, test_size=0.2, stratify=src_y, random_state=args.seed)

    if args.tgt == 'blog':
        tgt_x, tgt_y = CSV2Array(os.path.join('data', args.tgt, 'blog.csv'))

    elif args.tgt == 'airline':
        tgt_x, tgt_y = CSV2Array(os.path.join('data', args.tgt, 'airline.csv'))
    else:
        tgt_x, tgt_y = XML2Array(
            os.path.join('data', args.tgt, 'negative.review'),
            os.path.join('data', args.tgt, 'positive.review'))

    tgt_train_x, _, tgt_train_y, _ = train_test_split(tgt_x,
                                                      tgt_y,
                                                      test_size=0.2,
                                                      stratify=tgt_y,
                                                      random_state=args.seed)

    # RoBERTa uses its own feature conversion (different special tokens).
    if args.model == 'roberta':
        src_features = roberta_convert_examples_to_features(
            src_x, src_y, args.max_seq_length, tokenizer)
        src_test_features = roberta_convert_examples_to_features(
            src_test_x, src_test_y, args.max_seq_length, tokenizer)
        tgt_features = roberta_convert_examples_to_features(
            tgt_train_x, tgt_train_y, args.max_seq_length, tokenizer)
        tgt_all_features = roberta_convert_examples_to_features(
            tgt_x, tgt_y, args.max_seq_length, tokenizer)
    else:
        src_features = convert_examples_to_features(src_x, src_y,
                                                    args.max_seq_length,
                                                    tokenizer)
        src_test_features = convert_examples_to_features(
            src_test_x, src_test_y, args.max_seq_length, tokenizer)
        tgt_features = convert_examples_to_features(tgt_train_x, tgt_train_y,
                                                    args.max_seq_length,
                                                    tokenizer)
        tgt_all_features = convert_examples_to_features(
            tgt_x, tgt_y, args.max_seq_length, tokenizer)

    # load dataset

    src_data_loader = get_data_loader(src_features, args.batch_size)
    src_data_loader_eval = get_data_loader(src_test_features, args.batch_size)
    tgt_data_loader = get_data_loader(tgt_features, args.batch_size)
    tgt_data_loader_all = get_data_loader(tgt_all_features, args.batch_size)

    # load models
    if args.model == 'bert':
        encoder = BertEncoder()
        cls_classifier = BertClassifier()
        dom_classifier = DomainClassifier()
    elif args.model == 'distilbert':
        encoder = DistilBertEncoder()
        cls_classifier = BertClassifier()
        dom_classifier = DomainClassifier()
    else:
        encoder = RobertaEncoder()
        cls_classifier = RobertaClassifier()
        dom_classifier = RobertaDomainClassifier()

    # Optionally restore previously saved weights from disk.
    if args.load:
        encoder = init_model(encoder, restore=param.encoder_path)
        cls_classifier = init_model(cls_classifier,
                                    restore=param.cls_classifier_path)
        dom_classifier = init_model(dom_classifier,
                                    restore=param.dom_classifier_path)
    else:
        encoder = init_model(encoder)
        cls_classifier = init_model(cls_classifier)
        dom_classifier = init_model(dom_classifier)

    print("=== Start Training ===")
    if args.train:
        encoder, cls_classifier, dom_classifier = train(
            args, encoder, cls_classifier, dom_classifier, src_data_loader,
            src_data_loader_eval, tgt_data_loader, tgt_data_loader_all)

    print("=== Evaluating classifier for encoded target domain ===")
    print(">>> after training <<<")
    evaluate(encoder, cls_classifier, tgt_data_loader_all)
def test_roberta_embeddings():
    """Check that RoBERTaEmbeddings aligns subword vectors with Flair tokens.

    Reference hidden states for a fixed sentence are computed directly with
    ``RobertaModel`` (output_hidden_states=True) and compared against the
    token embeddings produced by ``RoBERTaEmbeddings`` for every pooling
    operation ("first", "last", "first_last", "mean"); finally the embedding
    sizes for multi-layer and scalar-mix configurations are verified.
    """
    roberta_model: str = "roberta-base"

    tokenizer = RobertaTokenizer.from_pretrained(roberta_model)
    model = RobertaModel.from_pretrained(
        pretrained_model_name_or_path=roberta_model, output_hidden_states=True
    )
    model.to(flair.device)
    model.eval()

    s: str = "Berlin and Munich have a lot of puppeteer to see ."

    with torch.no_grad():
        # <s>/</s> are prepended/appended manually so the raw tokenization
        # matches the sequence the model sees.
        tokens = tokenizer.tokenize("<s> " + s + " </s>")

        indexed_tokens = tokenizer.convert_tokens_to_ids(tokens)
        tokens_tensor = torch.tensor([indexed_tokens])
        tokens_tensor = tokens_tensor.to(flair.device)

        # With output_hidden_states=True the last element of the output tuple
        # holds the tuple of per-layer hidden states.
        hidden_states = model(tokens_tensor)[-1]

        # Index 1 is the first transformer layer (index 0 is the embedding
        # output); the trailing [0] selects the single sentence in the batch.
        first_layer = hidden_states[1][0]

    assert len(first_layer) == len(tokens)

    #         0           1      2       3        4         5       6      7      8       9      10     11     12     13    14      15
    #
    #       '<s>',      'Ber', 'lin', 'Ġand', 'ĠMunich', 'Ġhave', 'Ġa', 'Ġlot', 'Ġof', 'Ġpupp', 'ete', 'er', 'Ġto', 'Ġsee', 'Ġ.',  '</s>'
    #                      \     /       |        |         |       |      |      |         \      |      /     |      |      |
    #                       Berlin      and    Munich     have      a     lot     of           puppeteer        to    see     .
    #
    #                         0          1        2         3       4      5       6               7             8     9      10

    def embed_sentence(
        sentence: str,
        pooling_operation,
        layers: str = "1",
        use_scalar_mix: bool = False,
    ) -> Sentence:
        """Embed *sentence* with RoBERTaEmbeddings and return the Flair sentence."""
        embeddings = RoBERTaEmbeddings(
            pretrained_model_name_or_path=roberta_model,
            layers=layers,
            pooling_operation=pooling_operation,
            use_scalar_mix=use_scalar_mix,
        )
        flair_sentence = Sentence(sentence)
        embeddings.embed(flair_sentence)

        return flair_sentence

    # First subword embedding
    sentence_first_subword = embed_sentence(sentence=s, pooling_operation="first")

    first_token_embedding_ref = first_layer[1].tolist()
    first_token_embedding_actual = sentence_first_subword.tokens[0].embedding.tolist()

    puppeteer_first_subword_embedding_ref = first_layer[9].tolist()
    puppeteer_first_subword_embedding_actual = sentence_first_subword.tokens[
        7
    ].embedding.tolist()

    assert first_token_embedding_ref == first_token_embedding_actual
    assert (
        puppeteer_first_subword_embedding_ref
        == puppeteer_first_subword_embedding_actual
    )

    # Last subword embedding
    sentence_last_subword = embed_sentence(sentence=s, pooling_operation="last")

    # First token is splitted into two subwords.
    # As we use "last" as pooling operation, we consider the last subword as "first token" here
    first_token_embedding_ref = first_layer[2].tolist()
    first_token_embedding_actual = sentence_last_subword.tokens[0].embedding.tolist()

    puppeteer_last_subword_embedding_ref = first_layer[11].tolist()
    puppeteer_last_subword_embedding_actual = sentence_last_subword.tokens[
        7
    ].embedding.tolist()

    assert first_token_embedding_ref == first_token_embedding_actual
    assert (
        puppeteer_last_subword_embedding_ref == puppeteer_last_subword_embedding_actual
    )

    # First and last subword embedding
    sentence_first_last_subword = embed_sentence(
        sentence=s, pooling_operation="first_last"
    )

    first_token_embedding_ref = torch.cat([first_layer[1], first_layer[2]]).tolist()
    first_token_embedding_actual = sentence_first_last_subword.tokens[
        0
    ].embedding.tolist()

    puppeteer_first_last_subword_embedding_ref = torch.cat(
        [first_layer[9], first_layer[11]]
    ).tolist()
    puppeteer_first_last_subword_embedding_actual = sentence_first_last_subword.tokens[
        7
    ].embedding.tolist()

    assert first_token_embedding_ref == first_token_embedding_actual
    assert (
        puppeteer_first_last_subword_embedding_ref
        == puppeteer_first_last_subword_embedding_actual
    )

    # Mean of all subword embeddings
    sentence_mean_subword = embed_sentence(sentence=s, pooling_operation="mean")

    first_token_embedding_ref = calculate_mean_embedding(
        [first_layer[1], first_layer[2]]
    ).tolist()
    first_token_embedding_actual = sentence_mean_subword.tokens[0].embedding.tolist()

    puppeteer_mean_subword_embedding_ref = calculate_mean_embedding(
        [first_layer[9], first_layer[10], first_layer[11]]
    ).tolist()
    puppeteer_mean_subword_embedding_actual = sentence_mean_subword.tokens[
        7
    ].embedding.tolist()

    assert first_token_embedding_ref == first_token_embedding_actual
    assert (
        puppeteer_mean_subword_embedding_ref == puppeteer_mean_subword_embedding_actual
    )

    # Check embedding dimension when using multiple layers
    sentence_mult_layers = embed_sentence(
        sentence="Munich", pooling_operation="first", layers="1,2,3,4"
    )

    # Four concatenated 768-dim layers.
    ref_embedding_size = 4 * 768
    actual_embedding_size = len(sentence_mult_layers.tokens[0].embedding)

    assert ref_embedding_size == actual_embedding_size

    # Check embedding dimension when using multiple layers and scalar mix
    sentence_mult_layers_scalar_mix = embed_sentence(
        sentence="Berlin",
        pooling_operation="first",
        layers="1,2,3,4",
        use_scalar_mix=True,
    )

    # Scalar mix collapses the selected layers into a single 768-dim vector.
    ref_embedding_size = 1 * 768
    actual_embedding_size = len(sentence_mult_layers_scalar_mix.tokens[0].embedding)

    assert ref_embedding_size == actual_embedding_size
# Exemple #20
# 0

# DROP inference script: build a RoBERTa encoder from a pre-trained config,
# wrap it in the numerically-augmented network, restore trained weights,
# run inference over the input file and dump {question_id: predicted_answer}
# as JSON.

print("Build bert model.")
# from_pretrained is a classmethod; call it on the class rather than on a
# throwaway RobertaConfig() instance (the instance was discarded anyway).
bert_model = RobertaModel(RobertaConfig.from_pretrained(args.roberta_model))
print("Build Drop model.")
network = NumericallyAugmentedBertNet(bert_model,
                hidden_size=bert_model.config.hidden_size,
                dropout_prob=0.0,  # inference only: disable dropout
                use_gcn=args.use_gcn,
                gcn_steps=args.gcn_steps)

if args.cuda: network.cuda()
print("Load from pre path {}.".format(args.pre_path))
# Weights come from the fine-tuned checkpoint, not from_pretrained above.
network.load_state_dict(torch.load(args.pre_path))

print("Load data from {}.".format(args.inf_path))
tokenizer = RobertaTokenizer.from_pretrained(args.roberta_model)
inf_iter = DropBatchGen(args, tokenizer, DropReader(tokenizer, passage_length_limit=463, question_length_limit=46)._read(args.inf_path))

print("Start inference...")
result = {}
network.eval()
with torch.no_grad():
    for batch in tqdm(inf_iter):
        output_dict = network(**batch)
        # question_id and answer are parallel per-batch lists.
        for qid, answer in zip(output_dict["question_id"], output_dict["answer"]):
            result[qid] = answer["predicted_answer"]

with open(args.dump_path, "w", encoding="utf8") as f:
    json.dump(result, f)
# Exemple #21
# 0
 def __init__(self, pretrained_model_name, max_seq_len):
     """Wrap a pre-trained RoBERTa tokenizer with a fixed maximum sequence length."""
     super().__init__()
     self.max_seq_len = max_seq_len
     # RoBERTa's <pad> token id is 1; see
     # https://s3.amazonaws.com/models.huggingface.co/bert/roberta-base-vocab.json
     self.pad_value = 1
     self.tokenizer = RobertaTokenizer.from_pretrained(pretrained_model_name)
    def make_binary_dataset(input_prefix, output_prefix, lang, append_eos=False):
        """Binarize one data split for *lang* into an indexed dataset.

        Target-side text is binarized with the task's FlexibleDictionary;
        source-side text is re-tokenized with the RoBERTa BPE tokenizer and
        converted to ids, with sentences joined by the dictionary's separator
        index. Reads closure variables (args, dict_path, dataset_dest_path, ...)
        from the enclosing function.
        """
        # NOTE(review): the local name `dict` shadows the builtin throughout
        # this function.
        if lang == args.target_lang:
            dict = flexible_dictionary.FlexibleDictionary.load(dict_path(lang))
        else:
            # dict = bert_dictionary.BertDictionary.load(dict_path(lang))
            dict = gpt2_dictionary.GPT2Dictionary.load(dict_path(lang))

        print('| [{}] Dictionary: {} types | {} types (for real)'.format(lang, len(dict) - 1, len(dict)))

        ds = indexed_dataset.IndexedDatasetBuilder(dataset_dest_path(output_prefix, lang, 'bin'))

        def consumer(tensor):
            # Append one binarized sequence to the on-disk dataset.
            ds.add_item(tensor)

        input_file = '{}{}'.format(input_prefix, ('.' + lang) if lang is not None else '')
        if lang == args.target_lang:
            # Target side: use fairseq's standard dictionary binarization.
            res = Tokenizer.binarize(input_file, dict, consumer, append_eos=append_eos)
            print('| [{}] {}: {} sents, {} tokens, {:.3}% replaced by {}'.format(
                lang, input_file, res['nseq'], res['ntok'],
                100 * res['nunk'] / res['ntok'], dict.unk_word if hasattr(dict, 'unk_word') else '<no_unk_word>'))
        else:
            # read article
            # from pytorch_pretrained_bert.tokenization import BertTokenizer
            # tokenizer = BertTokenizer.from_pretrained('bert-base-uncased', do_lower_case=True)
            from pytorch_transformers import RobertaTokenizer
            tokenizer = RobertaTokenizer.from_pretrained('roberta-base')

            def penn_token2orig_token(sent):
                """Map Penn Treebank bracket tokens (-LRB- etc.) back to the original characters."""
                # -LRB- -RRB- -LSB- -RSB- -LCB- -RCB-
                '''
                penn2orig = {"``":'"', "''": '"',
                             "-LRB-": '(', "-RRB-": ')',
                             "-LSB-":'[', "-RSB-":']',
                             "-LCB-":'{', "-RCB-":'}'}
                '''
                penn2orig = {"-LRB-": '(', "-RRB-": ')',
                             "-LSB-": '[', "-RSB-": ']',
                             "-LCB-": '{', "-RCB-": '}',
                             "-lrb-": '(', "-rrb-": ')',
                             "-lsb-": '[', "-rsb-": ']',
                             "-lcb-": '{', "-rcb-": '}',}
                words = sent.strip().split()
                words = [wd if not wd in penn2orig else penn2orig[wd] for wd in words]
                return ' '.join(words)

            num_token, num_unk_token = 0, 0
            num_seq = 0
            skip_line = 0
            # NOTE(review): the file handle below is never explicitly closed.
            for line in open(input_file, encoding='utf8'):
                # One article per line; sentences are delimited by <S_SEP>.
                sents = line.strip().split('<S_SEP>')
                sents = sents[0:args.max_num_sentences]
                sents = [' '.join(sent.strip().split()[0:args.max_num_words]) for sent in sents]
                # print(sents)
                sents = [tokenizer.tokenize(penn_token2orig_token(sent)) for sent in sents]
                article_wids = []
                for i, sent in enumerate(sents):
                    # sometimes there are too many tokens
                    MAXLEN = 500
                    if len(sent) > MAXLEN:
                        # Over-long sentences are DROPPED entirely, not truncated.
                        # sent = sent[0:MAXLEN]
                        print(' '.join(sent))
                        skip_line += 1
                        print(skip_line)
                        continue
                    if i != 0:
                        # Insert the sentence-separator id between sentences.
                        article_wids.append( dict.sep_index )
                    wids = tokenizer.convert_tokens_to_ids(sent)
                    # wids_vocab = [dict.index(word) for word in sent]
                    # assert wids == wids_vocab, 'word indices should be the same!'
                    article_wids.extend(wids)
                    for wid in wids:
                        if wid == dict.unk_index:
                            num_unk_token += 1
                        num_token += 1

                num_seq += 1
                tensor = torch.IntTensor(article_wids)
                # print( dict.string_complete(tensor) )
                ds.add_item(tensor)

            print('| [{}] {}: {} sents, {} tokens, {:.3}% replaced by {}'.format(
                lang, input_file, num_seq, num_token,
                100 * num_unk_token / num_token, dict.unk_word if hasattr(dict, 'unk_word') else '<no_unk_word>'))

        # Write the index file that completes the binarized dataset pair.
        ds.finalize(dataset_dest_path(output_prefix, lang, 'idx'))
# Exemple #23
# 0
def main():
    """Entry point for DistilBERT-style knowledge distillation training.

    Parses all hyper-parameters, prepares the dump directory, loads the
    tokenizer/data/token-counts, builds the student (DistilBertForMaskedLM)
    and teacher (BERT or RoBERTa) models, and hands everything to Distiller.
    """
    parser = argparse.ArgumentParser(description="Training")

    parser.add_argument(
        "--dump_path",
        type=str,
        required=True,
        help="The output directory (log, checkpoints, parameters, etc.)")
    parser.add_argument(
        "--data_file",
        type=str,
        required=True,
        help=
        "The binarized file (tokenized + tokens_to_ids) and grouped by sequence."
    )
    parser.add_argument("--token_counts",
                        type=str,
                        required=True,
                        help="The token counts in the data_file for MLM.")
    parser.add_argument("--force",
                        action='store_true',
                        help="Overwrite dump_path if it already exists.")

    parser.add_argument("--vocab_size",
                        default=30522,
                        type=int,
                        help="The vocabulary size.")
    parser.add_argument(
        "--max_position_embeddings",
        default=512,
        type=int,
        help="Maximum sequence length we can model (including [CLS] and [SEP])."
    )
    # NOTE(review): this and the other store_false flags below default to
    # True; PASSING the flag turns the feature OFF, despite the "If true"
    # wording in the help strings.
    parser.add_argument(
        "--sinusoidal_pos_embds",
        action='store_false',
        help=
        "If true, the position embeddings are simply fixed with sinusoidal embeddings."
    )
    parser.add_argument("--n_layers",
                        default=6,
                        type=int,
                        help="Number of Transformer blocks.")
    parser.add_argument("--n_heads",
                        default=12,
                        type=int,
                        help="Number of heads in the self-attention module.")
    parser.add_argument(
        "--dim",
        default=768,
        type=int,
        help="Dimension through the network. Must be divisible by n_heads")
    parser.add_argument("--hidden_dim",
                        default=3072,
                        type=int,
                        help="Intermediate dimension in the FFN.")
    parser.add_argument("--dropout", default=0.1, type=float, help="Dropout.")
    parser.add_argument("--attention_dropout",
                        default=0.1,
                        type=float,
                        help="Dropout in self-attention.")
    parser.add_argument("--activation",
                        default='gelu',
                        type=str,
                        help="Activation to use in self-attention")
    parser.add_argument(
        "--tie_weights_",
        action='store_false',
        help=
        "If true, we tie the embeddings matrix with the projection over the vocabulary matrix. Default is true."
    )

    parser.add_argument("--from_pretrained_weights",
                        default=None,
                        type=str,
                        help="Load student initialization checkpoint.")
    parser.add_argument(
        "--from_pretrained_config",
        default=None,
        type=str,
        help="Load student initialization architecture config.")
    parser.add_argument("--teacher_type",
                        default="bert",
                        choices=["bert", "roberta"],
                        help="Teacher type (BERT, RoBERTa).")
    parser.add_argument("--teacher_name",
                        default="bert-base-uncased",
                        type=str,
                        help="The teacher model.")

    parser.add_argument("--temperature",
                        default=2.,
                        type=float,
                        help="Temperature for the softmax temperature.")
    parser.add_argument(
        "--alpha_ce",
        default=0.5,
        type=float,
        help="Linear weight for the distillation loss. Must be >=0.")
    parser.add_argument("--alpha_mlm",
                        default=0.5,
                        type=float,
                        help="Linear weight for the MLM loss. Must be >=0.")
    parser.add_argument("--alpha_mse",
                        default=0.0,
                        type=float,
                        help="Linear weight of the MSE loss. Must be >=0.")
    parser.add_argument(
        "--alpha_cos",
        default=0.0,
        type=float,
        help="Linear weight of the cosine embedding loss. Must be >=0.")
    parser.add_argument(
        "--mlm_mask_prop",
        default=0.15,
        type=float,
        help="Proportion of tokens for which we need to make a prediction.")
    parser.add_argument("--word_mask",
                        default=0.8,
                        type=float,
                        help="Proportion of tokens to mask out.")
    parser.add_argument("--word_keep",
                        default=0.1,
                        type=float,
                        help="Proportion of tokens to keep.")
    parser.add_argument("--word_rand",
                        default=0.1,
                        type=float,
                        help="Proportion of tokens to randomly replace.")
    parser.add_argument(
        "--mlm_smoothing",
        default=0.7,
        type=float,
        help=
        "Smoothing parameter to emphasize more rare tokens (see XLM, similar to word2vec)."
    )
    parser.add_argument(
        "--restrict_ce_to_mask",
        action='store_true',
        help=
        "If true, compute the distilation loss only the [MLM] prediction distribution."
    )

    parser.add_argument("--n_epoch",
                        type=int,
                        default=3,
                        help="Number of pass on the whole dataset.")
    parser.add_argument("--batch_size",
                        type=int,
                        default=5,
                        help="Batch size (for each process).")
    parser.add_argument(
        "--tokens_per_batch",
        type=int,
        default=-1,
        help=
        "If specified, modify the batches so that they have approximately this number of tokens."
    )
    parser.add_argument(
        "--shuffle",
        action='store_false',
        help="If true, shuffle the sequence order. Default is true.")
    parser.add_argument(
        "--group_by_size",
        action='store_false',
        help=
        "If true, group sequences that have similar length into the same batch. Default is true."
    )

    parser.add_argument(
        "--gradient_accumulation_steps",
        type=int,
        default=50,
        help="Gradient accumulation for larger training batches.")
    parser.add_argument("--warmup_prop",
                        default=0.05,
                        type=float,
                        help="Linear warmup proportion.")
    parser.add_argument("--weight_decay",
                        default=0.0,
                        type=float,
                        help="Weight deay if we apply some.")
    parser.add_argument("--learning_rate",
                        default=5e-4,
                        type=float,
                        help="The initial learning rate for Adam.")
    parser.add_argument("--adam_epsilon",
                        default=1e-6,
                        type=float,
                        help="Epsilon for Adam optimizer.")
    parser.add_argument("--max_grad_norm",
                        default=5.0,
                        type=float,
                        help="Max gradient norm.")
    parser.add_argument("--initializer_range",
                        default=0.02,
                        type=float,
                        help="Random initialization range.")

    parser.add_argument(
        '--fp16',
        action='store_true',
        help=
        "Whether to use 16-bit (mixed) precision (through NVIDIA apex) instead of 32-bit"
    )
    parser.add_argument(
        '--fp16_opt_level',
        type=str,
        default='O1',
        help=
        "For fp16: Apex AMP optimization level selected in ['O0', 'O1', 'O2', and 'O3']."
        "See details at https://nvidia.github.io/apex/amp.html")
    parser.add_argument("--n_gpu",
                        type=int,
                        default=1,
                        help="Number of GPUs in the node.")
    parser.add_argument("--local_rank",
                        type=int,
                        default=-1,
                        help="Distributed training - Local rank")
    parser.add_argument("--seed", type=int, default=56, help="Random seed")

    parser.add_argument("--log_interval",
                        type=int,
                        default=500,
                        help="Tensorboard logging interval.")
    parser.add_argument("--checkpoint_interval",
                        type=int,
                        default=4000,
                        help="Checkpoint interval.")
    args = parser.parse_args()

    ## ARGS ##
    init_gpu_params(args)
    set_seed(args)
    # Only the master process manages the dump directory and parameter logging.
    if args.is_master:
        if os.path.exists(args.dump_path):
            if not args.force:
                # NOTE(review): typo in message ("precised wheter") left as-is;
                # fixing it would change a runtime string.
                raise ValueError(
                    f'Serialization dir {args.dump_path} already exists, but you have not precised wheter to overwrite it'
                    'Use `--force` if you want to overwrite it')
            else:
                shutil.rmtree(args.dump_path)

        if not os.path.exists(args.dump_path):
            os.makedirs(args.dump_path)
        logger.info(
            f'Experiment will be dumped and logged in {args.dump_path}')

        ### SAVE PARAMS ###
        logger.info(f'Param: {args}')
        with open(os.path.join(args.dump_path, 'parameters.json'), 'w') as f:
            json.dump(vars(args), f, indent=4)
        git_log(args.dump_path)
    # Pretrained weights and config must be supplied together (or neither).
    assert (args.from_pretrained_weights is None and args.from_pretrained_config is None) or \
           (args.from_pretrained_weights is not None and args.from_pretrained_config is not None)

    ### TOKENIZER ###
    if args.teacher_type == 'bert':
        tokenizer = BertTokenizer.from_pretrained(args.teacher_name)
    elif args.teacher_type == 'roberta':
        tokenizer = RobertaTokenizer.from_pretrained(args.teacher_name)
    # Map special-token names (pad_token, mask_token, ...) to vocabulary ids.
    special_tok_ids = {}
    for tok_name, tok_symbol in tokenizer.special_tokens_map.items():
        idx = tokenizer.all_special_tokens.index(tok_symbol)
        special_tok_ids[tok_name] = tokenizer.all_special_ids[idx]
    logger.info(f'Special tokens {special_tok_ids}')
    args.special_tok_ids = special_tok_ids

    ## DATA LOADER ##
    logger.info(f'Loading data from {args.data_file}')
    with open(args.data_file, 'rb') as fp:
        data = pickle.load(fp)

    assert os.path.isfile(args.token_counts)
    logger.info(
        f'Loading token counts from {args.token_counts} (already pre-computed)'
    )
    with open(args.token_counts, 'rb') as fp:
        counts = pickle.load(fp)
        assert len(counts) == args.vocab_size
    # Smoothed inverse-frequency sampling weights for MLM masking; special
    # tokens get probability 0 so they are never selected for prediction.
    token_probs = np.maximum(counts, 1)**-args.mlm_smoothing
    for idx in special_tok_ids.values():
        token_probs[idx] = 0.  # do not predict special tokens
    token_probs = torch.from_numpy(token_probs)

    train_dataloader = Dataset(params=args, data=data)
    logger.info(f'Data loader created.')

    ## STUDENT ##
    if args.from_pretrained_weights is not None:
        assert os.path.isfile(args.from_pretrained_weights)
        assert os.path.isfile(args.from_pretrained_config)
        logger.info(
            f'Loading pretrained weights from {args.from_pretrained_weights}')
        logger.info(
            f'Loading pretrained config from {args.from_pretrained_config}')
        stu_architecture_config = DistilBertConfig.from_json_file(
            args.from_pretrained_config)
        stu_architecture_config.output_hidden_states = True
        student = DistilBertForMaskedLM.from_pretrained(
            args.from_pretrained_weights, config=stu_architecture_config)
    else:
        # Build the student from the CLI hyper-parameters instead.
        args.vocab_size_or_config_json_file = args.vocab_size
        stu_architecture_config = DistilBertConfig(**vars(args),
                                                   output_hidden_states=True)
        student = DistilBertForMaskedLM(stu_architecture_config)

    if args.n_gpu > 0:
        student.to(f'cuda:{args.local_rank}')
    logger.info(f'Student loaded.')

    ## TEACHER ##
    if args.teacher_type == 'bert':
        teacher = BertForMaskedLM.from_pretrained(args.teacher_name,
                                                  output_hidden_states=True)
    elif args.teacher_type == 'roberta':
        teacher = RobertaForMaskedLM.from_pretrained(args.teacher_name,
                                                     output_hidden_states=True)
    if args.n_gpu > 0:
        teacher.to(f'cuda:{args.local_rank}')
    logger.info(f'Teacher loaded from {args.teacher_name}.')

    ## DISTILLER ##
    torch.cuda.empty_cache()
    distiller = Distiller(params=args,
                          dataloader=train_dataloader,
                          token_probs=token_probs,
                          student=student,
                          teacher=teacher)
    distiller.train()
    logger.info("Let's go get some drinks.")
def main(args):
    """Generate translations/summaries with a trained fairseq-style model.

    Loads the dataset split and model ensemble, runs beam search (or
    sampling), prints S-/T-/H-/P-/A- lines for each sample, and scores the
    top hypotheses with BLEU. Returns the scorer.
    """
    assert args.path is not None, '--path required for generation!'
    assert not args.sampling or args.nbest == args.beam, \
        '--sampling requires --nbest to be equal to --beam'
    assert args.replace_unk is None or args.raw_text, \
        '--replace-unk requires a raw text dataset (--raw-text)'
    # if args.save_path is not None:
    #     if check_file_exists(args):
    #         return
    import_user_module(args)

    if args.max_tokens is None and args.max_sentences is None:
        args.max_tokens = 12000
    # print(args)
    utils.xpprint(args)

    use_cuda = torch.cuda.is_available() and not args.cpu

    # Load dataset splits
    task = tasks.setup_task(args)
    task.load_dataset(args.gen_subset)
    print('| {} {} {} examples'.format(args.data, args.gen_subset,
                                       len(task.dataset(args.gen_subset))))

    # Set dictionaries
    try:
        src_dict = getattr(task, 'source_dictionary', None)
    except NotImplementedError:
        src_dict = None
    tgt_dict = task.target_dictionary

    # Load ensemble
    print('| loading model(s) from {}'.format(args.path))
    # NOTE(review): eval() on a CLI string is unsafe for untrusted input;
    # this mirrors upstream fairseq behavior.
    models, _model_args = utils.load_ensemble_for_inference(
        args.path.split(':'),
        task,
        model_arg_overrides=eval(args.model_overrides),
    )

    # Optimize ensemble for generation
    for model in models:
        model.make_generation_fast_(
            beamable_mm_beam_size=None if args.no_beamable_mm else args.beam,
            need_attn=args.print_alignment,
        )
        if args.fp16:
            model.half()
        if use_cuda:
            model.cuda()

    # Load alignment dictionary for unknown word replacement
    # (None if no unknown word replacement, empty if no path to align dictionary)
    align_dict = utils.load_align_dict(args.replace_unk)

    # Load dataset (possibly sharded)
    itr = task.get_batch_iterator(
        dataset=task.dataset(args.gen_subset),
        max_tokens=args.max_tokens,
        max_sentences=args.max_sentences,
        max_positions=utils.resolve_max_positions(
            task.max_positions(),
            # *[model.max_positions() for model in models]
        ),
        ignore_invalid_inputs=args.skip_invalid_size_inputs_valid_test,
        required_batch_size_multiple=args.required_batch_size_multiple,
        num_shards=args.num_shards,
        shard_id=args.shard_id,
        num_workers=args.num_workers,
    ).next_epoch_itr(shuffle=False)

    # Initialize generator
    gen_timer = StopwatchMeter()
    generator = task.build_generator(args)

    # Generate and compute BLEU score
    if args.sacrebleu:
        scorer = bleu.SacrebleuScorer()
    else:
        scorer = bleu.Scorer(tgt_dict.pad(), tgt_dict.eos(), tgt_dict.unk())
    num_sentences = 0
    has_target = True
    # When the data was BPE-encoded with RoBERTa, a tokenizer is needed to
    # turn subword strings back into readable text for printing.
    if args.isRoberta:
        from pytorch_transformers import RobertaTokenizer
        tokenizer = RobertaTokenizer.from_pretrained('roberta-base')
    else:
        tokenizer = None
    with progress_bar.build_progress_bar(args, itr) as t:
        wps_meter = TimeMeter()
        for sample in t:
            sample = utils.move_to_cuda(sample) if use_cuda else sample
            if 'net_input' not in sample:
                continue

            prefix_tokens = None
            if args.prefix_size > 0:
                prefix_tokens = sample['target'][:, :args.prefix_size]

            gen_timer.start()
            hypos = task.inference_step(generator, models, sample,
                                        prefix_tokens)
            num_generated_tokens = sum(len(h[0]['tokens']) for h in hypos)
            gen_timer.stop(num_generated_tokens)

            for i, sample_id in enumerate(sample['id'].tolist()):
                has_target = sample['target'] is not None

                # Remove padding
                src_tokens = utils.strip_pad(
                    sample['net_input']['src_tokens'][i, :], tgt_dict.pad())
                target_tokens = None
                if has_target:
                    target_tokens = utils.strip_pad(
                        sample['target'][i, :], tgt_dict.pad()).int().cpu()

                # Either retrieve the original sentences or regenerate them from tokens.
                if align_dict is not None:
                    src_str = task.dataset(
                        args.gen_subset).src.get_original_text(sample_id)
                    target_str = task.dataset(
                        args.gen_subset).tgt.get_original_text(sample_id)
                else:
                    if src_dict is not None:
                        src_str = src_dict.string(src_tokens, args.remove_bpe)
                    else:
                        src_str = ""
                    if has_target:
                        target_str = tgt_dict.string(target_tokens,
                                                     args.remove_bpe,
                                                     escape_unk=True)

                if not args.quiet:
                    if src_dict is not None:
                        if not args.isRoberta:
                            print('S-{}\t{}'.format(sample_id, src_str))
                        else:
                            src_text = ''.join(src_str.strip().split())
                            src_out = tokenizer.convert_tokens_to_string(
                                src_text)
                            print('S-{}\t{}'.format(sample_id, src_out))
                    if has_target:
                        if not args.isRoberta:
                            print('T-{}\t{}'.format(sample_id, target_str))
                        else:
                            tgt_text = ''.join(target_str.strip().split())
                            tgt_out = tokenizer.convert_tokens_to_string(
                                tgt_text)
                            print('T-{}\t{}'.format(sample_id, tgt_out))
                # Process top predictions
                # NOTE(review): this inner loop reuses `i`, shadowing the
                # sample index above, and `min(len(hypos), args.nbest)` uses
                # the batch size where `len(hypos[i])` (hypotheses for this
                # sample) looks intended — verify against upstream fairseq.
                for i, hypo in enumerate(
                        hypos[i][:min(len(hypos), args.nbest)]):
                    hypo_tokens, hypo_str, alignment = utils.post_process_prediction(
                        hypo_tokens=hypo['tokens'].int().cpu(),
                        src_str=src_str,
                        alignment=hypo['alignment'].int().cpu()
                        if hypo['alignment'] is not None else None,
                        align_dict=align_dict,
                        tgt_dict=tgt_dict,
                        remove_bpe=args.remove_bpe,
                    )

                    if not args.quiet:
                        if not args.isRoberta:
                            print('H-{}\t{}\t{}'.format(
                                sample_id, hypo['score'], hypo_str))
                        else:
                            hypo_text = ''.join(hypo_str.strip().split())
                            hypo_out = tokenizer.convert_tokens_to_string(
                                hypo_text)
                            print('H-{}\t{}\t{}'.format(
                                sample_id, hypo['score'], hypo_out))
                        print('P-{}\t{}'.format(
                            sample_id, ' '.join(
                                map(
                                    lambda x: '{:.4f}'.format(x),
                                    hypo['positional_scores'].tolist(),
                                ))))

                        if args.print_alignment:
                            print('A-{}\t{}'.format(
                                sample_id, ' '.join(
                                    map(lambda x: str(utils.item(x)),
                                        alignment))))

                    # Score only the top hypothesis
                    if has_target and i == 0:
                        if align_dict is not None or args.remove_bpe is not None:
                            # Convert back to tokens for evaluation with unk replacement and/or without BPE
                            target_tokens = tgt_dict.encode_line(
                                target_str, add_if_not_exist=True)
                        if hasattr(scorer, 'add_string'):
                            scorer.add_string(target_str, hypo_str)
                        else:
                            scorer.add(target_tokens, hypo_tokens)

            wps_meter.update(num_generated_tokens)
            t.log({'wps': round(wps_meter.avg)})
            num_sentences += sample['nsentences']

    print(
        '| Translated {} sentences ({} tokens) in {:.1f}s ({:.2f} sentences/s, {:.2f} tokens/s)'
        .format(num_sentences, gen_timer.n, gen_timer.sum,
                num_sentences / gen_timer.sum, 1. / gen_timer.avg))
    if has_target:
        print('| Generate {} with beam={}: {}'.format(args.gen_subset,
                                                      args.beam,
                                                      scorer.result_string()))
    return scorer
# Load the enhanced WSC (Winograd Schema Challenge) perturbation dataset.
path_to_wsc = '../data/wsc_data/enhanced.tense.random.role.syn.voice.scramble.freqnoun.gender.number.adverb.tsv'
wsc_datapoints = pd.read_csv(path_to_wsc, sep='\t')


def replace_pronoun(tokenized_text, pronoun_index, tokenized_option):
    """Return a copy of ``tokenized_text`` in which the single token at
    ``pronoun_index`` is replaced by the tokens of ``tokenized_option``.

    The input list is left unmodified; a new list is returned.
    """
    prefix = tokenized_text[:pronoun_index]
    # Skip the original pronoun token itself.
    suffix = tokenized_text[pronoun_index + 1:]
    return prefix + tokenized_option + suffix


# Load pre-trained model tokenizer (vocabulary)
tokenizer = RobertaTokenizer.from_pretrained('roberta-large')

# Tallies updated in the evaluation loop below (the loop body is further down).
# NOTE(review): exact semantics are inferred from the names — confirm against
# the loop: correct on original vs. enhanced sentences, and how often the two
# variants receive the same prediction.
correct_preds = 0
correct_preds_enhanced = 0
stability_match = 0

all_preds = 0

# Load pre-trained model (weights); eval() disables dropout for scoring.
model = RobertaForMaskedLM.from_pretrained('roberta-large')
model.eval()

for q_index, dp_split in wsc_datapoints.iterrows():
    if dp_split['text_adverb'].replace(
            ' ', '') != '-' and dp_split['text_adverb'].replace(' ', ''):
Exemple #26
0
def main():
    """Entry point for adapter-model training/evaluation.

    Parses CLI arguments, sets up distributed/CUDA state, logging and seeding,
    builds a frozen pretrained RoBERTa plus a trainable adapter model, then
    optionally trains, and saves the adapter checkpoint and tokenizer.
    """
    parser = ArgumentParser()
    # --- data / model selection -------------------------------------------
    parser.add_argument("--data_dir", default=None, type=str, required=True,
                        help="The input data dir. Should contain the .tsv files (or other data files) for the task.")
    parser.add_argument("--model_type", default='roberta', type=str, required=True,
                        help="Model type selected in the list")
    parser.add_argument("--model_name_or_path", default='roberta-large', type=str, required=True,
                        help="Path to pre-trained model or shortcut name selected in the list: ")
    parser.add_argument("--task_name", default=None, type=str, required=True,
                        help="The name of the task to train.")
    parser.add_argument("--comment", default='', type=str,
                        help="The comment")
    parser.add_argument('--output_dir', type=Path, default="output")
    # NOTE(review): type=bool is an argparse pitfall — any non-empty string
    # (including "False") parses as True. Kept for CLI compatibility; prefer
    # action='store_true' / 'store_false' in new flags.
    parser.add_argument("--restore", type=bool, default=True, help="Whether restore from the last checkpoint, is nochenckpoints, start from scartch")

    parser.add_argument("--max_seq_length", type=int, default=256, help="max lenght of token sequence")
    parser.add_argument("--do_train", action='store_true',
                        help="Whether to run training.")
    parser.add_argument("--do_eval", action='store_true',
                        help="Whether to run eval on the dev set.")
    parser.add_argument("--evaluate_during_training", type=bool, default=False,
                        help="Rul evaluation during training at each logging step.")
    parser.add_argument("--do_lower_case", action='store_true',
                        help="Set this flag if you are using an uncased model.")

    # --- adapter architecture ---------------------------------------------
    parser.add_argument("--adapter_transformer_layers", default=2, type=int,
                        help="The transformer layers of adapter.")
    parser.add_argument("--adapter_size", default=768, type=int,
                        help="The hidden size of adapter.")
    parser.add_argument("--adapter_list", default="0,11,23", type=str,
                        help="The layer where add an adapter")
    parser.add_argument("--adapter_skip_layers", default=0, type=int,
                        help="The skip_layers of adapter according to bert layers")
    parser.add_argument('--meta_adapter_model', type=str, help='the pretrained adapter model')

    # --- optimization ------------------------------------------------------
    parser.add_argument("--per_gpu_train_batch_size", default=32, type=int,
                        help="Batch size per GPU/CPU for training.")
    parser.add_argument("--per_gpu_eval_batch_size", default=64, type=int,
                        help="Batch size per GPU/CPU for evaluation.")
    parser.add_argument('--gradient_accumulation_steps', type=int, default=1,
                        help="Number of updates steps to accumulate before performing a backward/update pass.")
    parser.add_argument("--learning_rate", default=3e-5, type=float,
                        help="The initial learning rate for Adam.")
    parser.add_argument("--weight_decay", default=0.0, type=float,
                        help="Weight deay if we apply some.")
    parser.add_argument("--adam_epsilon", default=1e-8, type=float,
                        help="Epsilon for Adam optimizer.")
    parser.add_argument("--max_grad_norm", default=1.0, type=float,
                        help="Max gradient norm.")
    parser.add_argument("--num_train_epochs", default=3, type=int,
                        help="Total number of training epochs to perform.")
    parser.add_argument("--max_steps", default=-1, type=int,
                        help="If > 0: set total number of training steps to perform. Override num_train_epochs.")
    parser.add_argument("--warmup_steps", default=0, type=int,
                        help="Linear warmup over warmup_steps.")

    # --- logging / checkpointing ------------------------------------------
    parser.add_argument('--logging_steps', type=int, default=10,
                        help="How often do we snapshot losses, for inclusion in the progress dump? (0 = disable)")
    parser.add_argument('--save_steps', type=int, default=1000,
                        help="Save checkpoint every X updates steps.")
    parser.add_argument('--eval_steps', type=int, default=None,
                        help="eval every X updates steps.")
    parser.add_argument('--max_save_checkpoints', type=int, default=500,
                        help="The max amounts of checkpoint saving. Bigger than it will delete the former checkpoints")
    parser.add_argument("--eval_all_checkpoints", action='store_true',
                        help="Evaluate all checkpoints starting with the same prefix as model_name ending and ending with step number")
    parser.add_argument("--no_cuda", action='store_true',
                        help="Avoid using CUDA when available")
    parser.add_argument('--overwrite_output_dir', action='store_true',
                        help="Overwrite the content of the output directory")
    parser.add_argument('--overwrite_cache', action='store_true',
                        help="Overwrite the cached training and evaluation sets")
    parser.add_argument('--seed', type=int, default=42,
                        help="random seed for initialization")

    # --- hardware / distributed -------------------------------------------
    parser.add_argument('--fp16', action='store_true',
                        help="Whether to use 16-bit (mixed) precision (through NVIDIA apex) instead of 32-bit")
    parser.add_argument('--fp16_opt_level', type=str, default='O1',
                        help="For fp16: Apex AMP optimization level selected in ['O0', 'O1', 'O2', and 'O3']."
                             "See details at https://nvidia.github.io/apex/amp.html")
    parser.add_argument("--local_rank", type=int, default=-1,
                        help="For distributed training: local_rank")
    parser.add_argument('--server_ip', type=str, default='', help="For distant debugging.")
    parser.add_argument('--server_port', type=str, default='', help="For distant debugging.")
    parser.add_argument('--negative_sample', type=int, default=0, help='how many negative samples to select')

    # args
    args = parser.parse_args()

    # "0,11,23" -> [0, 11, 23]: the pretrained-model layers to attach adapters to.
    args.adapter_list = args.adapter_list.split(',')
    args.adapter_list = [int(i) for i in args.adapter_list]

    # Derive a unique run name from the hyperparameters, used as output subdir.
    name_prefix = 'maxlen-'+str(args.max_seq_length)+'_'+'epoch-'+str(args.num_train_epochs)+'_'+'batch-'+str(args.per_gpu_train_batch_size)+'_'+'lr-'+str(args.learning_rate)+'_'+'warmup-'+str(args.warmup_steps)+'_'+str(args.comment)
    args.my_model_name = args.task_name+'_'+name_prefix
    args.output_dir = os.path.join(args.output_dir, args.my_model_name)

    if args.eval_steps is None:
        args.eval_steps = args.save_steps*2

    # Setup distant debugging if needed
    if args.server_ip and args.server_port:
        # Distant debugging - see https://code.visualstudio.com/docs/python/debugging#_attach-to-a-local-script
        import ptvsd
        print("Waiting for debugger attach")
        ptvsd.enable_attach(address=(args.server_ip, args.server_port), redirect_output=True)
        ptvsd.wait_for_attach()

    # Setup CUDA, GPU & distributed training
    if args.local_rank == -1 or args.no_cuda:
        device = torch.device("cuda" if torch.cuda.is_available() and not args.no_cuda else "cpu")
        args.n_gpu = torch.cuda.device_count()
    else:  # Initializes the distributed backend which will take care of sychronizing nodes/GPUs
        torch.cuda.set_device(args.local_rank)
        device = torch.device("cuda", args.local_rank)
        torch.distributed.init_process_group(backend='nccl')
        args.n_gpu = 1
    args.device = device

    # Setup logging
    logging.basicConfig(format='%(asctime)s - %(levelname)s - %(name)s -   %(message)s',
                        datefmt='%m/%d/%Y %H:%M:%S',
                        level=logging.INFO if args.local_rank in [-1, 0] else logging.WARN)
    logger.warning("Process rank: %s, device: %s, n_gpu: %s, distributed training: %s, 16-bits training: %s",
                   args.local_rank, device, args.n_gpu, bool(args.local_rank != -1), args.fp16)

    # Set seed
    set_seed(args)

    args.output_mode = output_modes[args.task_name]

    processor = processors[args.task_name]()
    label_list = processor.get_labels()
    num_labels = len(label_list)

    # Load pretrained model and tokenizer
    if args.local_rank not in [-1, 0]:
        torch.distributed.barrier()  # Make sure only the first process in distributed training will download model & vocab

    # NOTE(review): tokenizer/config are hard-coded to 'roberta-large' and the
    # config local is never used below — presumably args.model_name_or_path was
    # intended; verify against the training code.
    tokenizer = RobertaTokenizer.from_pretrained('roberta-large')
    config = RobertaConfig.from_pretrained('roberta-large', output_attentions=True)

    pretrained_model = PretrainedModel()
    adapter_model = AdapterModel(args, pretrained_model.config)

    if args.local_rank == 0:
        torch.distributed.barrier()  # Make sure only the first process in distributed training will download model & vocab

    pretrained_model.to(args.device)
    adapter_model.to(args.device)
    # train()/evaluate() expect the (frozen backbone, trainable adapter) pair.
    model = (pretrained_model, adapter_model)

    logger.info("Training/evaluation parameters %s", args)
    val_dataset = load_and_cache_examples(args, args.task_name, tokenizer, 'dev', evaluate=True)

    # Training
    if args.do_train:
        train_dataset = load_and_cache_examples(args, args.task_name, tokenizer, 'train', evaluate=False)

        global_step, tr_loss = train(args, train_dataset, val_dataset, model, tokenizer)
        logger.info(" global_step = %s, average loss = %s", global_step, tr_loss)

    if args.do_train and (args.local_rank == -1 or torch.distributed.get_rank() == 0):
        if not os.path.exists(args.output_dir) and args.local_rank in [-1, 0]:
            os.makedirs(args.output_dir)

        logger.info("Saving model checkpoint to %s", args.output_dir)
        # BUGFIX: the original read `model.module` — but `model` is a tuple, so
        # that would raise AttributeError whenever adapter_model is wrapped
        # (DataParallel/DistributedDataParallel). Unwrap adapter_model itself.
        model_to_save = adapter_model.module if hasattr(adapter_model, 'module') else adapter_model  # Take care of distributed/parallel training
        model_to_save.save_pretrained(args.output_dir)
        tokenizer.save_pretrained(args.output_dir)
        torch.save(args, os.path.join(args.output_dir, 'training_args.bin'))
Exemple #27
0
    def make_binary_dataset(vocab, input_prefix, output_prefix, lang, num_workers):
        """Binarize one side of a summarization dataset into a fairseq
        IndexedDataset, tokenizing each sentence with the RoBERTa BPE tokenizer.

        Args:
            vocab: dictionary used only for the size report printed below; the
                token ids actually written come from the RobertaDictionary
                loaded a few lines down.
            input_prefix: path prefix of the raw text input; the file read is
                ``input_prefix + '.' + lang`` (sentences separated by <S_SEP>).
            output_prefix: path prefix for the binarized .bin/.idx output.
            lang: ``'article'`` (source, truncated to 512 BPE tokens) or
                ``'summary'`` (target, truncated to 256 BPE tokens).
            num_workers: unused in this implementation.
        """
        print("| [{}] Dictionary: {} types".format(lang, len(vocab) - 1))
        print('input_prefix', input_prefix)
        print(dict_path(lang))

        # NOTE(review): `dict` shadows the builtin for the rest of this scope.
        dict = roberta_dictionary.RobertaDictionary.load(dict_path(lang))
        input_file = "{}{}".format(
            input_prefix, ("." + lang) if lang is not None else ""
        )
        from pytorch_transformers import RobertaTokenizer
        import torch

        tokenizer = RobertaTokenizer.from_pretrained('roberta-large')

        def penn_token2orig_token(sent):
            # Map Penn Treebank bracket/quote escapes back to the original
            # characters, e.g. -LRB- -> '(' and `` -> '"'.
            # -LRB- -RRB- -LSB- -RSB- -LCB- -RCB-
            penn2orig = {"``":'"', "''": '"',
                         "-LRB-": '(', "-RRB-": ')',
                         "-LSB-":'[', "-RSB-":']',
                         "-LCB-":'{', "-RCB-":'}',
                         "-lrb-": '(', "-rrb-": ')',
                         "-lsb-": '[', "-rsb-": ']',
                         "-lcb-": '{', "-rcb-": '}',
                         }
            words = sent.strip().split()
            words = [wd if not wd in penn2orig else penn2orig[wd] for wd in words]
            return ' '.join(words)

        # Running statistics for the summary line printed at the end.
        num_token, num_unk_token = 0, 0
        num_seq = 0
        ds = indexed_dataset.IndexedDatasetBuilder(
            dataset_dest_file(args, output_prefix, lang, "bin")
        )
        # Max sequence length after BPE: 512 for the article side, 256 for summaries.
        truncated_number = 512 if lang == 'article' else 256
        CLS_TOKEN = '<s>'
        SEP_TOKEN = '</s>'
        if lang == 'article':
            for line in open(input_file, encoding='utf8'):
                article_wids = []
                # Sentence-level filtering thresholds; note min_src_sentence and
                # max_src_sentence bound sentence counts, the *_ntokens_per_sent
                # ones bound tokens per sentence (min_src_sentence is unused).
                min_src_sentence = 3
                max_src_sentence = 100
                max_src_ntokens_per_sent = 200
                min_src_ntokens_per_sent = 5
                sents = line.strip().split('<S_SEP>')
                sents = [sent.strip().split() for sent in sents]
                # Keep sentences longer than the minimum, clip each to the
                # per-sentence max, and keep at most max_src_sentence of them.
                idxs = [i for i, sent in enumerate(sents) if (len(sent) > min_src_ntokens_per_sent)]
                src = [sents[i][:max_src_ntokens_per_sent] for i in idxs]
                src = src[:max_src_sentence]
                src_txt = [' '.join(sent) for sent in src]
                src_tokens = [tokenizer.tokenize(sent) for sent in src_txt]
                for i, sent in enumerate(src_tokens):
                    # Hard cap on BPE tokens per sentence.
                    MAX_SENT_NTOKENS = 500
                    if len(sent) > MAX_SENT_NTOKENS:
                        sent = sent[:MAX_SENT_NTOKENS]
                    # First sentence gets <s> ... </s>; later ones </s> ... </s>.
                    if i == 0:
                        input_text = [CLS_TOKEN] + sent + [SEP_TOKEN]
                    elif i != 0:
                        input_text = [SEP_TOKEN] + sent + [SEP_TOKEN]
                    wids = tokenizer.convert_tokens_to_ids(input_text)
                    article_wids.extend(wids)
                    for wid in wids:
                        if wid == dict.unk_index:
                            num_unk_token += 1
                        num_token += 1
                num_seq += 1
                # Truncate to the max length, and force the last token to be
                # the separator so every article ends with </s>.
                article_wids = article_wids[:truncated_number] if len(article_wids) > truncated_number else article_wids
                if article_wids[-1] != dict.sep_index:
                    article_wids[-1] = dict.sep_index
                tensor = torch.IntTensor(article_wids)
                # print( dict.string_complete(tensor) )
                ds.add_item(tensor)
            ds.finalize(dataset_dest_file(args, output_prefix, lang, "idx"))
        elif lang == 'summary':
            for line in open(input_file, encoding='utf8'):
                article_wids = []
                # NOTE(review): these two bounds are never applied below.
                max_tgt_ntokens = 500
                min_tgt_ntokens = 5
                sents = line.strip().split('<S_SEP>')
                sents = [tokenizer.tokenize(sent) for sent in sents]
                for i, sent in enumerate(sents):
                    # sometimes, there are too many token in one single sentence
                    # to be specific, there are 8 sentences in the training article longer than 512, so truncate them to 500
                    # MAX_SENT_LEN = 500
                    # if len(sent) > MAX_SENT_LEN:
                    #     sent = sent[:MAX_SENT_LEN]
                    # Sentences are joined with a single </s> separator; no
                    # leading <s> on the target side.
                    if i != 0:
                        input_text = [SEP_TOKEN] + sent
                    else:
                        input_text = sent
                    wids = tokenizer.convert_tokens_to_ids(input_text)
                    # wtoks = tokenizer.convert_ids_to_tokens(wids)
                    # wstring = tokenizer.convert_tokens_to_string(wtoks)

                    # wids_vocab = [dict.index(word) for word in input_text]
                    # assert wids == wids_vocab, 'word indices should be the same!'
                    article_wids.extend(wids)
                    for wid in wids:
                        if wid == dict.unk_index:
                            num_unk_token += 1
                        num_token += 1

                num_seq += 1
                # Truncate, then drop a trailing separator if one survived.
                article_wids = article_wids[:truncated_number] if len(article_wids) > truncated_number else article_wids
                if article_wids[-1] == dict.sep_index:
                    article_wids = article_wids[:len(article_wids)-1]
                # print(article_wids)
                # NOTE(review): dead code — after the truncation above this
                # condition can never be true, so the warning never fires.
                if len(article_wids) > truncated_number:
                    print('lang: %s, token len: %d, truncated len: %d' % (lang, len(article_wids), truncated_number))

                tensor = torch.IntTensor(article_wids)
                # print( dict.string_complete(tensor) )
                ds.add_item(tensor)
            ds.finalize(dataset_dest_file(args, output_prefix, lang, "idx"))

        print('| [{}] {}: {} sents, {} tokens, {:.3}% replaced by {}'.format(
            lang, input_file, num_seq, num_token,
            100 * num_unk_token / num_token, dict.unk_word if hasattr(dict, 'unk_word') else '<no_unk_word>'))
    def __init__(self, args, dictionary, embed_tokens, left_pad=False):
        """Build an encoder backed by a causal-masked RoBERTa/BERT backbone
        plus a BERT-style sentence-level transformer.

        Args:
            args: fairseq args; reads roberta_model, distributed_rank, dropout,
                attention_dropout, encoder_layers, attn_type and
                sentence_transformer_arch.
            dictionary: fairseq dictionary passed to the parent class.
            embed_tokens: token embedding module; only its embedding_dim and
                padding_idx are used here (the module itself is not kept).
            left_pad: unused in this implementation.

        Raises:
            Exception: if args.attn_type or args.sentence_transformer_arch is
                unsupported.
        """
        super().__init__(dictionary)
        self.dropout = args.dropout

        # from pytorch_transformers import RobertaModel
        from fairseq.modules.roberta_causal_mask import RobertaCasulMaskModel, BertCasulMaskModel
        from pytorch_transformers.file_utils import PYTORCH_TRANSFORMERS_CACHE
        from pytorch_transformers import RobertaConfig, RobertaTokenizer, BertConfig, BertTokenizer

        # Select the backbone family from the model-name prefix; each rank gets
        # its own cache subdirectory to avoid concurrent-download clashes.
        if args.roberta_model.startswith('roberta'):
            self.roberta = RobertaCasulMaskModel.from_pretrained(
                args.roberta_model,
                cache_dir=PYTORCH_TRANSFORMERS_CACHE /
                'distributed_{}'.format(args.distributed_rank))
            self.config = RobertaConfig.from_pretrained(args.roberta_model)
            self.tokenizer = RobertaTokenizer.from_pretrained(
                args.roberta_model)
        else:
            self.roberta = BertCasulMaskModel.from_pretrained(
                args.roberta_model,
                cache_dir=PYTORCH_TRANSFORMERS_CACHE /
                'distributed_{}'.format(args.distributed_rank))
            self.config = BertConfig.from_pretrained(args.roberta_model)
            self.tokenizer = BertTokenizer.from_pretrained(args.roberta_model)
        self.config.output_attentions = True
        # The pooler head is unused downstream, so freeze its parameters.
        self.roberta.pooler.dense.weight.requires_grad = False
        self.roberta.pooler.dense.bias.requires_grad = False

        embed_dim = embed_tokens.embedding_dim
        self.padding_idx = embed_tokens.padding_idx

        # self.embed_tokens = embed_tokens
        # self.embed_scale = math.sqrt(embed_dim)

        self.args = args

        # if args.sentence_transformer_arch == 'fairseq':
        #     self.padding_idx = embed_tokens.padding_idx

        #     self.sent_embed_positions = PositionalEmbedding(
        #         1024, embed_dim, self.padding_idx,
        #         left_pad=False,
        #         learned=args.encoder_learned_pos,
        #     )

        #     self.doc_layers = nn.ModuleList([])
        #     self.doc_layers.extend([
        #         TransformerEncoderLayer(args)
        #         for i in range(args.encoder_layers)
        #     ])
        if args.sentence_transformer_arch == 'bert':
            # from pytorch_transformers import RobertaConfig, RobertaTokenizer

            # self.config = RobertaConfig.from_pretrained(args.roberta_model)
            # self.config.output_attentions = True
            # self.tokenizer = RobertaTokenizer.from_pretrained('roberta-base')

            # Switch embed_dim/padding_idx to the backbone's values: the
            # sentence transformer consumes the backbone's hidden states.
            embed_dim = self.config.hidden_size
            print('*** padding idx before ***', embed_tokens.padding_idx)
            self.padding_idx = self.tokenizer.convert_tokens_to_ids(
                self.tokenizer.pad_token)
            print('*** padding idx after ***', self.padding_idx)

            # let's assume each document has at most 128-self.padding_idx-1 sentences
            # in case of roberta, it is 126
            self.sent_position_embeddings = nn.Embedding(128, embed_dim)
            # Override the backbone config's depth/dropout with fairseq args
            # (only when they are truthy/non-zero).
            if args.encoder_layers:
                self.config.num_hidden_layers = args.encoder_layers
            if args.dropout:
                self.config.hidden_dropout_prob = args.dropout
            if args.attention_dropout:
                self.config.attention_probs_dropout_prob = args.attention_dropout
            if args.attn_type == 'attn_score':
                self.sent_encoder = AttnScoreBertEncoder(self.config)
            elif args.attn_type == 'attn_prob':
                self.sent_encoder = BertEncoder(self.config)
            else:
                raise Exception('--attn-type doesn\'t support {} yet !'.format(
                    args.attn_type))
            # Re-initialize the sentence encoder weights (it is trained from scratch).
            self.sent_encoder.apply(self._init_weights)

            print('*** sentence encoder config ***')
            print(self.config)
        else:
            raise Exception(
                '--sentence-transformer-arch doesn\'t support {} yet!'.format(
                    args.sentence_transformer_arch))
Exemple #29
0
    print(srl)

    crf = ConditionalRandomField(len(roles_to_idx),
                                 None,
                                 include_start_end_transitions=True)
    print(crf)

    model_parameters = filter(lambda p: p.requires_grad,
                              chain(srl.parameters(), crf.parameters()))

    num_params = sum([np.prod(p.size()) for p in model_parameters])
    print("Total parameters =", num_params)
    print(params)

    if params.use_bert:
        bert_tokenizer = RobertaTokenizer.from_pretrained("roberta-base")
        bert_model = RobertaModel.from_pretrained("roberta-base",
                                                  output_hidden_states=True)
        if params.gpu_id > -1:
            bert_model.cuda()
    else:
        bert_tokenizer = None
        bert_model = None
    if params.gpu_id > -1:
        srl.cuda()
        crf.cuda()

    srl.load_state_dict(torch.load(os.path.join(params.dir, params.modelname)))

    crf.load_state_dict(
        torch.load(os.path.join(params.dir, params.modelname + "crf")))
Exemple #30
0
def main():
    """Entry point for adversarial domain adaptation between text domains.

    Loads source/target domain data, trains a source classifier, then adapts a
    target encoder with a GAN-style discriminator and evaluates the result.
    """
    args = parse_arguments()
    # argument setting
    print("=== Argument Setting ===")
    print("src: " + args.src)
    print("tgt: " + args.tgt)
    print("seed: " + str(args.seed))
    print("train_seed: " + str(args.train_seed))
    print("model_type: " + str(args.model))
    print("max_seq_length: " + str(args.max_seq_length))
    print("batch_size: " + str(args.batch_size))
    print("pre_epochs: " + str(args.pre_epochs))
    print("num_epochs: " + str(args.num_epochs))
    print("temperature: " + str(args.temperature))
    set_seed(args.train_seed)

    if args.model == 'roberta':
        tokenizer = RobertaTokenizer.from_pretrained('roberta-base')
    else:
        tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

    # preprocess data
    print("=== Processing datasets ===")
    # 'blog' and 'airline' ship as CSV; the review datasets are XML pairs.
    if args.src == 'blog':
        src_x, src_y = CSV2Array(os.path.join('data', args.src, 'blog.csv'))

    elif args.src == 'airline':
        src_x, src_y = CSV2Array(os.path.join('data', args.src, 'airline.csv'))

    else:
        src_x, src_y = XML2Array(
            os.path.join('data', args.src, 'negative.review'),
            os.path.join('data', args.src, 'positive.review'))

    # Stratified 80/20 split of the source domain.
    src_x, src_test_x, src_y, src_test_y = train_test_split(
        src_x, src_y, test_size=0.2, stratify=src_y, random_state=args.seed)

    if args.tgt == 'blog':
        tgt_x, tgt_y = CSV2Array(os.path.join('data', args.tgt, 'blog.csv'))

    elif args.tgt == 'airline':
        tgt_x, tgt_y = CSV2Array(os.path.join('data', args.tgt, 'airline.csv'))
    else:
        tgt_x, tgt_y = XML2Array(
            os.path.join('data', args.tgt, 'negative.review'),
            os.path.join('data', args.tgt, 'positive.review'))

    # BUGFIX: the original unpacked `tgt_train_x, tgt_test_y, tgt_train_y,
    # tgt_test_y`, binding tgt_test_y twice and never binding tgt_test_x.
    # sklearn's return order is (train_x, test_x, train_y, test_y).
    tgt_train_x, tgt_test_x, tgt_train_y, tgt_test_y = train_test_split(
        tgt_x, tgt_y, test_size=0.2, stratify=tgt_y, random_state=args.seed)

    # Tokenize/featurize every split for the chosen model family.
    if args.model == 'roberta':
        src_features = roberta_convert_examples_to_features(
            src_x, src_y, args.max_seq_length, tokenizer)
        src_test_features = roberta_convert_examples_to_features(
            src_test_x, src_test_y, args.max_seq_length, tokenizer)
        tgt_features = roberta_convert_examples_to_features(
            tgt_x, tgt_y, args.max_seq_length, tokenizer)
        tgt_train_features = roberta_convert_examples_to_features(
            tgt_train_x, tgt_train_y, args.max_seq_length, tokenizer)
    else:
        src_features = convert_examples_to_features(src_x, src_y,
                                                    args.max_seq_length,
                                                    tokenizer)
        src_test_features = convert_examples_to_features(
            src_test_x, src_test_y, args.max_seq_length, tokenizer)
        tgt_features = convert_examples_to_features(tgt_x, tgt_y,
                                                    args.max_seq_length,
                                                    tokenizer)
        tgt_train_features = convert_examples_to_features(
            tgt_train_x, tgt_train_y, args.max_seq_length, tokenizer)

    # load dataset

    src_data_loader = get_data_loader(src_features, args.batch_size)
    src_data_eval_loader = get_data_loader(src_test_features, args.batch_size)
    tgt_data_train_loader = get_data_loader(tgt_train_features,
                                            args.batch_size)
    tgt_data_all_loader = get_data_loader(tgt_features, args.batch_size)

    # load models: matching encoder/classifier pair per model family.
    if args.model == 'bert':
        src_encoder = BertEncoder()
        tgt_encoder = BertEncoder()
        src_classifier = BertClassifier()
    elif args.model == 'distilbert':
        src_encoder = DistilBertEncoder()
        tgt_encoder = DistilBertEncoder()
        src_classifier = BertClassifier()
    else:
        src_encoder = RobertaEncoder()
        tgt_encoder = RobertaEncoder()
        src_classifier = RobertaClassifier()
    discriminator = Discriminator()

    # Either restore all components from checkpoints or initialize fresh.
    if args.load:
        src_encoder = init_model(args,
                                 src_encoder,
                                 restore=param.src_encoder_path)
        src_classifier = init_model(args,
                                    src_classifier,
                                    restore=param.src_classifier_path)
        tgt_encoder = init_model(args,
                                 tgt_encoder,
                                 restore=param.tgt_encoder_path)
        discriminator = init_model(args,
                                   discriminator,
                                   restore=param.d_model_path)
    else:
        src_encoder = init_model(args, src_encoder)
        src_classifier = init_model(args, src_classifier)
        tgt_encoder = init_model(args, tgt_encoder)
        discriminator = init_model(args, discriminator)

    # train source model
    print("=== Training classifier for source domain ===")
    if args.pretrain:
        src_encoder, src_classifier = pretrain(args, src_encoder,
                                               src_classifier, src_data_loader)

    # eval source model
    print("=== Evaluating classifier for source domain ===")
    evaluate(src_encoder, src_classifier, src_data_loader)
    evaluate(src_encoder, src_classifier, src_data_eval_loader)
    evaluate(src_encoder, src_classifier, tgt_data_all_loader)

    # Freeze the source networks before adversarial adaptation.
    for params in src_encoder.parameters():
        params.requires_grad = False

    for params in src_classifier.parameters():
        params.requires_grad = False

    # train target encoder by GAN
    print("=== Training encoder for target domain ===")
    if args.adapt:
        tgt_encoder.load_state_dict(src_encoder.state_dict())
        tgt_encoder = adapt(args, src_encoder, tgt_encoder, discriminator,
                            src_classifier, src_data_loader,
                            tgt_data_train_loader, tgt_data_all_loader)

    # eval target encoder on lambda0.1 set of target dataset
    print("=== Evaluating classifier for encoded target domain ===")
    print(">>> source only <<<")
    evaluate(src_encoder, src_classifier, tgt_data_all_loader)
    print(">>> domain adaption <<<")
    evaluate(tgt_encoder, src_classifier, tgt_data_all_loader)