Example #1
import torch
import torch.nn as nn
from transformers import RobertaConfig, RobertaForMaskedLM, RobertaTokenizerFast

# EqualisedLinearLayer is a project-specific layer assumed to be defined elsewhere.


class TextEncoder(nn.Module):  # placeholder class name; the original snippet shows only __init__
    def __init__(self, cfg, device):
        super().__init__()
        # Custom BPE tokenizer trained on the project's corpus.
        tokenizer = RobertaTokenizerFast.from_pretrained('./bird_bpe_vocab', max_len=256)
        _config = RobertaConfig(
            vocab_size=tokenizer._tokenizer.get_vocab_size(),
            hidden_size=512,
            num_hidden_layers=4,
            num_attention_heads=8,
            max_position_embeddings=256,
            pad_token_id=1,
            eos_token_id=0,
            bos_token_id=2,
            output_attentions=False,
            output_hidden_states=False
        )
        # Small RoBERTa masked LM initialised from a local checkpoint and frozen in eval mode.
        _model = RobertaForMaskedLM(_config)
        _model.load_state_dict(torch.load('bert_small/checkpoint-1100/pytorch_model.bin'))
        _model.eval()
        self.tokenizer = tokenizer
        self._model = _model
        self.device = device
        self.pad_token = 0
        self.batch_size = cfg.batch_size
        # Optional projection of the 512-d hidden states into the latent space.
        self.proj = None
        if cfg.proj_lang:
            self.proj = nn.Sequential(
                EqualisedLinearLayer(512, cfg.latent_dim, weight_scaling=cfg.weight_scaling),
                nn.Tanh())
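A minimal usage sketch for the encoder above, assuming it is called with raw caption strings: the `encode_text` helper, the mean pooling, and the access to the backbone via `._model.roberta` are illustrative assumptions, since the original snippet only shows the constructor.

import torch


@torch.no_grad()
def encode_text(encoder, sentences):
    """Tokenize a batch of strings and return one (optionally projected) vector per sentence."""
    # Assumes the module (and hence `_model`) has already been moved to `encoder.device`.
    batch = encoder.tokenizer(sentences,
                              padding=True,
                              truncation=True,
                              max_length=256,
                              return_tensors="pt").to(encoder.device)
    # Call the underlying RobertaModel (`.roberta`) to get hidden states rather than LM logits.
    hidden = encoder._model.roberta(input_ids=batch["input_ids"],
                                    attention_mask=batch["attention_mask"])[0]
    # Mean-pool over non-padding tokens.
    mask = batch["attention_mask"].unsqueeze(-1).float()
    pooled = (hidden * mask).sum(dim=1) / mask.sum(dim=1)
    return encoder.proj(pooled) if encoder.proj is not None else pooled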
Example #2
import json
import logging
import random
import string

import torch
from happytransformer import HappyROBERTA  # happytransformer v1.x wrapper
from transformers import RobertaConfig, RobertaForMaskedLM, RobertaTokenizer

# `proc` (fictitious-entity generation) and `run_pipeline` are project-specific
# helpers assumed to be imported from the surrounding repository.


def main():
    random.seed(1012)
    logging.basicConfig(
        format='%(asctime)s - %(levelname)s - %(name)s -   %(message)s',
        datefmt='%m/%d/%Y %H:%M:%S',
        level=logging.INFO)
    logger = logging.getLogger(__name__)
    chars = string.ascii_lowercase
    number_of_entity_trials = 10

    tokenizer = RobertaTokenizer.from_pretrained('roberta-large')
    # checkpoint_path = '/home/rahul/common_sense_embedding_analysis/data/finetune_data/save_step_92160/checkpoint.pt'
    # state_dict = torch.load(checkpoint_path)["model"]
    # roberta = RobertaForMaskedLM.from_pretrained('roberta-base', state_dict=state_dict)

    # # Initializing a RoBERTa configuration
    # config = RobertaConfig.from_pretrained('roberta-base')
    # # Initializing a model from the configuration
    # roberta = RobertaForMaskedLM(config)
    # checkpoint_path = '/home/rahul/common_sense_embedding_analysis/data/finetune_data/save_step_92160/checkpoint.pt'
    # state_dict = torch.load(checkpoint_path)["model"]
    # roberta.load_state_dict(state_dict)

    roberta = HappyROBERTA('roberta-large')

    # Build a fresh RobertaForMaskedLM and load the fine-tuned checkpoint into it.
    config = RobertaConfig.from_pretrained('roberta-large')
    mlm = RobertaForMaskedLM(config)
    #checkpoint_path = '/home/rahul/common_sense_embedding_analysis/data/finetune_data/save_step_92160/checkpoint.pt'
    #checkpoint_path = '/home/rahul/common_sense_embedding_analysis/data/finetune_data/roberta-base/save_step_230400/checkpoint.pt'
    #checkpoint_path = '/home/rahul/common_sense_embedding_analysis/data/finetune_data/roberta-base/roberta_base_best_sample_from_sets/checkpoint.pt'
    checkpoint_path = '../data/finetune_data/roberta-large/save_step_57000/checkpoint.pt'
    state_dict = torch.load(checkpoint_path)["model"]
    mlm.load_state_dict(state_dict)
    mlm.eval()

    # Swap the fine-tuned masked LM into the HappyROBERTA wrapper.
    roberta.mlm = mlm

    fictitious_entities = proc.generate_pairs_of_random_strings(
        number_of_pairs=100, min_length=3, max_length=12, character_set=chars)
    with open("../data/truism_data/physical_data_sentences_2.json", "r") as f:
        physical_sents = json.load(f)

    with open("../data/truism_data/physical_data_2.json", "r") as f:
        physical_config = json.load(f)

    with open("../data/finetune_data/sample_from_sets/test_keys.json",
              "r") as f:
        test_keys = json.load(f)

    phy_filtered = {}
    for key in test_keys['phy']:
        index = key.split("-")[0]
        ling_pert = key.split("-")[1]
        asym_pert = key.split("-")[2]
        if index not in phy_filtered.keys():
            phy_filtered[index] = {}
            phy_filtered[index][ling_pert] = {}
            phy_filtered[index][ling_pert][asym_pert] = physical_sents[index][
                ling_pert][asym_pert]
        elif ling_pert not in phy_filtered[index].keys():
            phy_filtered[index][ling_pert] = {}
            phy_filtered[index][ling_pert][asym_pert] = physical_sents[index][
                ling_pert][asym_pert]
        else:
            phy_filtered[index][ling_pert][asym_pert] = physical_sents[index][
                ling_pert][asym_pert]
    # physical_sents = {k: physical_sents[k] for k in ('11', '16')}
    # physical_config  = {k: physical_config[k] for k in ('11', '16')}

    logger.info("finished reading in physical data")

    output_df = run_pipeline(model=roberta,
                             tokenizer=tokenizer,
                             fictitious_entities=fictitious_entities,
                             sentences=phy_filtered,
                             config=physical_config,
                             number_of_entity_trials=number_of_entity_trials,
                             logger=logger)

    output_df.to_csv(
        "../data/masked_word_result_data/roberta/sample_from_set/physical_perf_ft19_new_{}.csv"
        .format(number_of_entity_trials),
        index=False)

    logger.info("finished saving physical dataset results")

    with open("../data/truism_data/material_data_sentences_2.json", "r") as f:
        material_sents = json.load(f)

    with open("../data/truism_data/material_data_2.json", "r") as f:
        material_config = json.load(f)

    mat_filtered = {}
    for key in test_keys['mat']:
        index = key.split("-")[0]
        ling_pert = key.split("-")[1]
        asym_pert = key.split("-")[2]
        if index not in mat_filtered.keys():
            mat_filtered[index] = {}
            mat_filtered[index][ling_pert] = {}
            mat_filtered[index][ling_pert][asym_pert] = material_sents[index][
                ling_pert][asym_pert]
        elif ling_pert not in mat_filtered[index].keys():
            mat_filtered[index][ling_pert] = {}
            mat_filtered[index][ling_pert][asym_pert] = material_sents[index][
                ling_pert][asym_pert]
        else:
            mat_filtered[index][ling_pert][asym_pert] = material_sents[index][
                ling_pert][asym_pert]

    logger.info("finished reading in material data")

    output_df = run_pipeline(model=roberta,
                             tokenizer=tokenizer,
                             fictitious_entities=fictitious_entities,
                             sentences=mat_filtered,
                             config=material_config,
                             number_of_entity_trials=number_of_entity_trials,
                             logger=logger)

    output_df.to_csv(
        "../data/masked_word_result_data/roberta/sample_from_set/material_perf_ft19_new_{}.csv"
        .format(number_of_entity_trials),
        index=False)

    logger.info("finished saving physical material results")

    with open("../data/truism_data/social_data_sentences_2.json", "r") as f:
        social_sents = json.load(f)

    with open("../data/truism_data/social_data_2.json", "r") as f:
        social_config = json.load(f)

    soc_filtered = {}
    for key in test_keys['soc']:
        index = key.split("-")[0]
        ling_pert = key.split("-")[1]
        asym_pert = key.split("-")[2]
        if index not in soc_filtered.keys():
            soc_filtered[index] = {}
            soc_filtered[index][ling_pert] = {}
            soc_filtered[index][ling_pert][asym_pert] = social_sents[index][
                ling_pert][asym_pert]
        elif ling_pert not in soc_filtered[index].keys():
            soc_filtered[index][ling_pert] = {}
            soc_filtered[index][ling_pert][asym_pert] = social_sents[index][
                ling_pert][asym_pert]
        else:
            soc_filtered[index][ling_pert][asym_pert] = social_sents[index][
                ling_pert][asym_pert]

    logger.info("finished reading in social data")

    output_df = run_pipeline(model=roberta,
                             tokenizer=tokenizer,
                             fictitious_entities=fictitious_entities,
                             sentences=soc_filtered,
                             config=social_config,
                             number_of_entity_trials=number_of_entity_trials,
                             logger=logger)

    output_df.to_csv(
        "../data/masked_word_result_data/roberta/sample_from_set/social_perf_ft19_new_{}.csv"
        .format(number_of_entity_trials),
        index=False)

    logger.info("finished saving physical social results")
Example #3
import os

import torch
from fastNLP import seq_len_to_mask  # assumed source of seq_len_to_mask in this codebase
from transformers import RobertaConfig, RobertaForMaskedLM, RobertaTokenizer


class Roberta(object):
    def __init__(self, args):
        # self.dict_file = "{}/{}".format(args.roberta_model_dir, args.roberta_vocab_name)
        self.tokenizer = RobertaTokenizer.from_pretrained('roberta-base')
        if args.model_path is not None:
            print("Testing CoLAKE...")
            print('loading model parameters from {}...'.format(
                args.model_path))
            config = RobertaConfig.from_pretrained('roberta-base',
                                                   type_vocab_size=3)
            self.model = RobertaForMaskedLM(config=config)
            states_dict = torch.load(os.path.join(args.model_path,
                                                  'model.bin'))
            self.model.load_state_dict(states_dict, strict=False)
        else:
            print("Testing RoBERTa baseline...")
            self.model = RobertaForMaskedLM.from_pretrained('roberta-base')

        self._build_vocab()
        self._init_inverse_vocab()
        self._model_device = 'cpu'
        self.max_sentence_length = args.max_sentence_length

    def _cuda(self):
        self.model.cuda()

    def _build_vocab(self):
        self.vocab = []
        for key in range(len(self.tokenizer)):
            value = self.tokenizer.decode([key])
            if value[0] == " ":  # if the token starts with a whitespace
                value = value.strip()
            else:
                # this is subword information
                value = "_{}_".format(value)

            if value in self.vocab:
                # print("WARNING: token '{}' is already in the vocab".format(value))
                value = "{}_{}".format(value, key)

            self.vocab.append(value)
        print("size of vocabulary: {}".format(len(self.vocab)))

    def _init_inverse_vocab(self):
        self.inverse_vocab = {w: i for i, w in enumerate(self.vocab)}

    def try_cuda(self):
        """Move model to GPU if one is available."""
        if torch.cuda.is_available():
            if self._model_device != 'cuda':
                self._cuda()
                self._model_device = 'cuda'
        else:
            print('No CUDA found')

    def init_indices_for_filter_logprobs(self, vocab_subset):
        index_list = []
        new_vocab_subset = []
        for word in vocab_subset:
            if word in self.inverse_vocab:
                inverse_id = self.inverse_vocab[word]
                index_list.append(inverse_id)
                new_vocab_subset.append(word)
            else:
                msg = "word {} from vocab_subset not in model vocabulary!".format(
                    word)
                print("WARNING: {}".format(msg))

        indices = torch.as_tensor(index_list)
        return indices, index_list

    def filter_logprobs(self, log_probs, indices):
        new_log_probs = log_probs.index_select(dim=2, index=indices)
        return new_log_probs

    def get_id(self, input_string):
        # Roberta predicts ' London' and not 'London'
        string = " " + str(input_string).strip()
        tokens = self.tokenizer.encode(string, add_special_tokens=False)
        # return [element.item() for element in tokens.long().flatten()]
        return tokens

    def get_batch_generation(self, samples_list, try_cuda=True):
        if not samples_list:
            return None
        if try_cuda:
            self.try_cuda()

        tensor_list = []
        masked_indices_list = []
        max_len = 0
        output_tokens_list = []
        seq_len = []
        for sample in samples_list:
            masked_inputs_list = sample["masked_sentences"]

            tokens_list = [self.tokenizer.bos_token_id]

            for idx, masked_input in enumerate(masked_inputs_list):
                tokens_list.extend(
                    self.tokenizer.encode(" " + masked_input.strip(),
                                          add_special_tokens=False))
                tokens_list.append(self.tokenizer.eos_token_id)

            # tokens = torch.cat(tokens_list)[: self.max_sentence_length]
            tokens = torch.tensor(tokens_list)[:self.max_sentence_length]
            output_tokens_list.append(tokens.long().cpu().numpy())

            seq_len.append(len(tokens))
            if len(tokens) > max_len:
                max_len = len(tokens)
            tensor_list.append(tokens)
            masked_index = (
                tokens == self.tokenizer.mask_token_id).nonzero().numpy()
            for x in masked_index:
                masked_indices_list.append([x[0]])
        tokens_list = []
        for tokens in tensor_list:
            pad_length = max_len - len(tokens)
            if pad_length > 0:
                pad_tensor = torch.full([pad_length],
                                        self.tokenizer.pad_token_id,
                                        dtype=torch.int)
                tokens = torch.cat((tokens, pad_tensor.long()))
            tokens_list.append(tokens)

        batch_tokens = torch.stack(tokens_list)
        seq_len = torch.LongTensor(seq_len)
        attn_mask = seq_len_to_mask(seq_len)

        with torch.no_grad():
            # with utils.eval(self.model.model):
            self.model.eval()
            outputs = self.model(
                batch_tokens.long().to(device=self._model_device),
                attention_mask=attn_mask.to(device=self._model_device))
            log_probs = outputs[0]

        return log_probs.cpu(), output_tokens_list, masked_indices_list
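A minimal driver sketch for the class above, assuming an `args` object with `model_path=None` (the plain RoBERTa baseline branch) and a `max_sentence_length`; everything outside the class, including the example sentence, is illustrative.

from argparse import Namespace

args = Namespace(model_path=None, max_sentence_length=256)
lm = Roberta(args)

samples = [{"masked_sentences": ["The capital of France is <mask>."]}]
log_probs, token_ids, masked_indices = lm.get_batch_generation(samples)

# Read off the highest-scoring token at the first masked position of the first sample.
position = masked_indices[0][0]
predicted_id = log_probs[0, position].argmax().item()
print(lm.tokenizer.decode([predicted_id]).strip())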
import glob
import logging

import transformers
from transformers import (AutoConfig, DataCollatorForLanguageModeling,
                          HfArgumentParser, Trainer, TrainingArguments,
                          set_seed)
from transformers.trainer_utils import is_main_process

# ModelArguments, DataTrainingArguments, ArchitectureArguments, CustomOthersArguments,
# the Thai tokenizer classes, the Memmap*/PaddedDataset dataset classes, and the two
# sanity-check helpers are project-specific and assumed to be importable from the
# surrounding codebase.

logger = logging.getLogger(__name__)


def main():
    # See all possible arguments in src/transformers/training_args.py
    # or by passing the --help flag to this script.
    # We now keep distinct sets of args, for a cleaner separation of concerns.

    parser = HfArgumentParser(
        (ModelArguments, DataTrainingArguments, TrainingArguments,
         ArchitectureArguments, CustomOthersArguments))

    (model_args, data_args, training_args, arch_args,
     custom_args) = parser.parse_args_into_dataclasses()

    # Setup logging
    logging.basicConfig(
        format="%(asctime)s - %(levelname)s - %(name)s -   %(message)s",
        datefmt="%m/%d/%Y %H:%M:%S",
        level=logging.INFO
        if is_main_process(training_args.local_rank) else logging.WARN,
    )

    # Log on each process the small summary:
    logger.warning(
        f"Process rank: {training_args.local_rank}, device: {training_args.device}, n_gpu: {training_args.n_gpu}, "
        f"distributed training: {bool(training_args.local_rank != -1)}, 16-bits training: {training_args.fp16}"
    )
    # Set the verbosity to info of the Transformers logger (on main process only):
    if is_main_process(training_args.local_rank):
        transformers.utils.logging.set_verbosity_info()
    logger.info("Training/evaluation parameters %s", training_args)

    # Set seed before initializing model.
    set_seed(training_args.seed)

    train_files = list(
        sorted(glob.glob(f'{data_args.train_dir}/*.{custom_args.ext}')))
    validation_files = list(
        sorted(glob.glob(f'{data_args.eval_dir}/*.{custom_args.ext}')))
    if len(train_files) > 1:
        logger.warning(
            f'Got {len(train_files)} train files; only picking the first one.')
        train_files = train_files[:1]
    if len(validation_files) > 1:
        logger.warning(
            f'Got {len(validation_files)} validation files; only picking the first one.'
        )
        validation_files = validation_files[:1]

    # Load pretrained model and tokenizer
    #
    # Distributed training:
    # The .from_pretrained methods guarantee that only one local process can concurrently
    # download model & vocab.
    # Create config for LM model
    if model_args.tokenizer_type == 'ThaiRobertaTokenizer':
        tokenizer = ThaiRobertaTokenizer.from_pretrained(
            model_args.tokenizer_name_or_path,
            use_fast=model_args.use_fast_tokenizer)
    elif model_args.tokenizer_type == 'ThaiWordsNewmmTokenizer':
        tokenizer = ThaiWordsNewmmTokenizer.from_pretrained(
            model_args.tokenizer_name_or_path)
    elif model_args.tokenizer_type == 'ThaiWordsSyllableTokenizer':
        tokenizer = ThaiWordsSyllableTokenizer.from_pretrained(
            model_args.tokenizer_name_or_path)
    elif model_args.tokenizer_type == 'FakeSefrCutTokenizer':
        tokenizer = FakeSefrCutTokenizer.from_pretrained(
            model_args.tokenizer_name_or_path)
    else:
        raise NotImplementedError(
            f'tokenizer_type {model_args.tokenizer_type} is not implemented.')

    if custom_args.ext == 'txt':
        if len(train_files) > 1 or len(validation_files) > 1:
            raise NotImplementedError('only one txt file support for now')
        if data_args.datasets_type == 'MemmapLineByLineTextDataset':
            datasets = {
                'train':
                MemmapLineByLineTextDataset(
                    tokenizer, train_files[0], data_args.max_seq_length,
                    os.path.join(data_args.datasets_cache_dir, 'train'),
                    custom_args.tokenize_chunksize, data_args.overwrite_cache),
                'validation':
                MemmapLineByLineTextDataset(
                    tokenizer, validation_files[0], data_args.max_seq_length,
                    os.path.join(data_args.datasets_cache_dir, 'validation'),
                    custom_args.tokenize_chunksize, data_args.overwrite_cache)
            }
        elif data_args.datasets_type == 'MemmapConcatFullSentenceTextDataset':
            datasets = {
                'train':
                MemmapConcatFullSentenceTextDataset(
                    tokenizer, train_files[0], data_args.max_seq_length,
                    os.path.join(data_args.datasets_cache_dir, 'train'),
                    custom_args.tokenize_chunksize, data_args.overwrite_cache),
                'validation':
                PaddedDataset(
                    MemmapConcatFullSentenceTextDataset(
                        tokenizer, validation_files[0],
                        data_args.max_seq_length,
                        os.path.join(data_args.datasets_cache_dir,
                                     'validation'),
                        custom_args.tokenize_chunksize,
                        data_args.overwrite_cache), tokenizer.pad_token_id,
                    data_args.max_seq_length)
            }
        else:
            raise NotImplementedError(
                f'No specified datasets type {data_args.datasets_type}')
    else:
        raise NotImplementedError(f'{custom_args.ext} files are not supported yet, '
                                  f'but this should be possible to support.')

    if custom_args.build_dataset_only:
        return

    ROBERTA_PRETRAINED_CONFIG_ARCHIVE_MAP = {
        "roberta-base": "../roberta_config/th-roberta-base-config.json",
        "roberta-large": "../roberta_config/th-roberta-large-config.json",
    }

    config = AutoConfig.from_pretrained(
        pretrained_model_name_or_path=ROBERTA_PRETRAINED_CONFIG_ARCHIVE_MAP[
            arch_args.architecture],
        vocab_size=tokenizer.vocab_size)

    # Some sanity check
    tokenizer_and_model_config_mismatch(config, tokenizer)
    block_size_exceed_max_position_embeddings(config, data_args.max_seq_length)

    # Initialize model
    model = RobertaForMaskedLM(config=config)

    if custom_args.model_dir is not None:
        model_path = os.path.join(custom_args.model_dir, 'pytorch_model.bin')
        logger.info(
            f'[INFO] Loading pretrained model (state_dict) from {model_path}')
        # Use strict=False to keep the model compatible with checkpoints saved by older
        # transformers versions, so we can bump the transformers version and use the new
        # datasets library; see https://github.com/huggingface/transformers/issues/6882
        # The program itself will run, but does it have any side effects? Possibly a bad idea.
        try:
            model.load_state_dict(state_dict=torch.load(model_path))
        except RuntimeError:
            logger.info(
                '[INFO] RuntimeError, try loading with strict=False instead.')
            model.load_state_dict(state_dict=torch.load(model_path),
                                  strict=False)
        # Without strict=False, loading raises an error because the keys do not match:
        # RuntimeError: Error(s) in loading state_dict for RobertaForMaskedLM:
        #     Missing key(s) in state_dict: "roberta.embeddings.position_ids".
        #     Unexpected key(s) in state_dict: "roberta.pooler.dense.weight",
        #         "roberta.pooler.dense.bias".

    data_collator = DataCollatorForLanguageModeling(
        tokenizer=tokenizer, mlm_probability=data_args.mlm_probability)

    # Initialize trainer
    trainer = Trainer(model=model,
                      args=training_args,
                      train_dataset=datasets["train"],
                      eval_dataset=datasets["validation"],
                      data_collator=data_collator)

    # Training
    if custom_args.model_dir is not None:
        trainer.train(model_path=custom_args.model_dir)
    else:
        trainer.train()

    # Save the final model.
    output_model_dir = os.path.join(training_args.output_dir, 'roberta_thai')
    logger.info("Saving final model to '%s'.", output_model_dir)
    trainer.save_model(output_model_dir)

    if trainer.is_world_process_zero():
        output_tokenizer_dir = os.path.join(training_args.output_dir,
                                            'roberta_thai_tokenizer')
        tokenizer.save_pretrained(output_tokenizer_dir)

    # evaluate
    trainer.evaluate()
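A brief post-training sanity check, as a sketch: reload the saved model and tokenizer from the `roberta_thai` / `roberta_thai_tokenizer` directories created above and score a masked token. The `output_dir` value and the example sentence are placeholders, and the tokenizer is reloaded with the same project-specific class used during training.

import os

import torch
from transformers import RobertaForMaskedLM

output_dir = "path/to/output_dir"  # placeholder: the TrainingArguments output_dir used above
model = RobertaForMaskedLM.from_pretrained(os.path.join(output_dir, "roberta_thai"))
tokenizer = ThaiRobertaTokenizer.from_pretrained(  # or whichever project tokenizer was used
    os.path.join(output_dir, "roberta_thai_tokenizer"))
model.eval()

text = f"a sentence with a {tokenizer.mask_token} token"  # placeholder; use real Thai text
inputs = tokenizer(text, return_tensors="pt")
with torch.no_grad():
    logits = model(**inputs)[0]
mask_positions = (inputs["input_ids"][0] == tokenizer.mask_token_id).nonzero(as_tuple=True)[0]
top_ids = logits[0, mask_positions[0]].topk(5).indices
print([tokenizer.decode([int(i)]) for i in top_ids])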