def create_and_check_for_masked_lm(self, config, input_ids, token_type_ids,
                                   input_mask, sequence_labels,
                                   token_labels, choice_labels):
    model = RobertaForMaskedLM(config=config)
    model.to(torch_device)
    model.eval()
    result = model(input_ids,
                   attention_mask=input_mask,
                   token_type_ids=token_type_ids,
                   labels=token_labels)
    self.parent.assertEqual(
        result.logits.shape,
        (self.batch_size, self.seq_length, self.vocab_size))
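For reference, a standalone version of the forward pass this test exercises might look as follows. This is a sketch assuming a recent transformers 4.x release (where the model returns a MaskedLMOutput by default); the checkpoint name and the sentence are placeholders.

import torch
from transformers import RobertaForMaskedLM, RobertaTokenizer

tokenizer = RobertaTokenizer.from_pretrained("roberta-base")
model = RobertaForMaskedLM.from_pretrained("roberta-base")
model.eval()

# "<mask>" is RoBERTa's mask token. In practice the labels are usually -100
# everywhere except the masked positions, but reusing the input ids keeps the
# sketch short.
inputs = tokenizer("The capital of France is <mask>.", return_tensors="pt")
with torch.no_grad():
    outputs = model(**inputs, labels=inputs["input_ids"])

# outputs.loss is the masked-LM loss; outputs.logits has shape
# (batch_size, sequence_length, vocab_size), matching the assertion above.
print(outputs.loss.item(), outputs.logits.shape)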
Example #2
def create_and_check_roberta_for_masked_lm(self, config, input_ids, token_type_ids, input_mask, sequence_labels,
                                           token_labels, choice_labels):
    model = RobertaForMaskedLM(config=config)
    model.to(torch_device)
    model.eval()
    loss, prediction_scores = model(input_ids, attention_mask=input_mask, token_type_ids=token_type_ids,
                                    masked_lm_labels=token_labels)
    result = {
        "loss": loss,
        "prediction_scores": prediction_scores,
    }
    self.parent.assertListEqual(
        list(result["prediction_scores"].size()),
        [self.batch_size, self.seq_length, self.vocab_size])
    self.check_loss_output(result)
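This second example targets the older transformers interface, where the masked-LM labels were passed as masked_lm_labels and the model returned a plain tuple. Under a current 4.x release the same check would be written roughly as below, a sketch reusing the variables from the example above.

# labels replaces the removed masked_lm_labels argument, and the model now
# returns a MaskedLMOutput instead of a (loss, prediction_scores) tuple.
outputs = model(input_ids,
                attention_mask=input_mask,
                token_type_ids=token_type_ids,
                labels=token_labels)
loss, prediction_scores = outputs.loss, outputs.logits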
Example #3
def main():
    random.seed(1012)
    logging.basicConfig(
        format='%(asctime)s - %(levelname)s - %(name)s -   %(message)s',
        datefmt='%m/%d/%Y %H:%M:%S',
        level=logging.INFO)
    logger = logging.getLogger(__name__)
    chars = string.ascii_lowercase
    number_of_entity_trials = 10

    tokenizer = RobertaTokenizer.from_pretrained('roberta-large')
    # checkpoint_path = '/home/rahul/common_sense_embedding_analysis/data/finetune_data/save_step_92160/checkpoint.pt'
    # state_dict = torch.load(checkpoint_path)["model"]
    # roberta = RobertaForMaskedLM.from_pretrained('roberta-base', state_dict=state_dict)

    # # Initializing a RoBERTa configuration
    # config = RobertaConfig.from_pretrained('roberta-base')
    # # Initializing a model from the configuration
    # roberta = RobertaForMaskedLM(config)
    # checkpoint_path = '/home/rahul/common_sense_embedding_analysis/data/finetune_data/save_step_92160/checkpoint.pt'
    # state_dict = torch.load(checkpoint_path)["model"]
    # roberta.load_state_dict(state_dict)

    roberta = HappyROBERTA('roberta-large')

    config = RobertaConfig.from_pretrained('roberta-large')
    mlm = RobertaForMaskedLM(config)
    #checkpoint_path = '/home/rahul/common_sense_embedding_analysis/data/finetune_data/save_step_92160/checkpoint.pt'
    #checkpoint_path = '/home/rahul/common_sense_embedding_analysis/data/finetune_data/roberta-base/save_step_230400/checkpoint.pt'
    #checkpoint_path = '/home/rahul/common_sense_embedding_analysis/data/finetune_data/roberta-base/roberta_base_best_sample_from_sets/checkpoint.pt'
    checkpoint_path = '../data/finetune_data/roberta-large/save_step_57000/checkpoint.pt'
    state_dict = torch.load(checkpoint_path)["model"]
    mlm.load_state_dict(state_dict)
    mlm.eval()

    roberta.mlm = mlm

    fictitious_entities = proc.generate_pairs_of_random_strings(
        number_of_pairs=100, min_length=3, max_length=12, character_set=chars)
    with open("../data/truism_data/physical_data_sentences_2.json", "r") as f:
        physical_sents = json.load(f)

    with open("../data/truism_data/physical_data_2.json", "r") as f:
        physical_config = json.load(f)

    with open("../data/finetune_data/sample_from_sets/test_keys.json",
              "r") as f:
        test_keys = json.load(f)

    # keep only the index / linguistic-perturbation / asymmetry combinations
    # listed in the test split (see the consolidated helper sketch after this example)
    phy_filtered = {}
    for key in test_keys['phy']:
        index = key.split("-")[0]
        ling_pert = key.split("-")[1]
        asym_pert = key.split("-")[2]
        if index not in phy_filtered.keys():
            phy_filtered[index] = {}
            phy_filtered[index][ling_pert] = {}
            phy_filtered[index][ling_pert][asym_pert] = physical_sents[index][
                ling_pert][asym_pert]
        elif ling_pert not in phy_filtered[index].keys():
            phy_filtered[index][ling_pert] = {}
            phy_filtered[index][ling_pert][asym_pert] = physical_sents[index][
                ling_pert][asym_pert]
        else:
            phy_filtered[index][ling_pert][asym_pert] = physical_sents[index][
                ling_pert][asym_pert]
    # physical_sents = {k: physical_sents[k] for k in ('11', '16')}
    # physical_config  = {k: physical_config[k] for k in ('11', '16')}

    logger.info("finished reading in physical data")

    output_df = run_pipeline(model=roberta,
                             tokenizer=tokenizer,
                             fictitious_entities=fictitious_entities,
                             sentences=phy_filtered,
                             config=physical_config,
                             number_of_entity_trials=number_of_entity_trials,
                             logger=logger)

    output_df.to_csv(
        "../data/masked_word_result_data/roberta/sample_from_set/physical_perf_ft19_new_{}.csv"
        .format(number_of_entity_trials),
        index=False)

    logger.info("finished saving physical dataset results")

    with open("../data/truism_data/material_data_sentences_2.json", "r") as f:
        material_sents = json.load(f)

    with open("../data/truism_data/material_data_2.json", "r") as f:
        material_config = json.load(f)

    mat_filtered = {}
    for key in test_keys['mat']:
        index = key.split("-")[0]
        ling_pert = key.split("-")[1]
        asym_pert = key.split("-")[2]
        if index not in mat_filtered.keys():
            mat_filtered[index] = {}
            mat_filtered[index][ling_pert] = {}
            mat_filtered[index][ling_pert][asym_pert] = material_sents[index][
                ling_pert][asym_pert]
        elif ling_pert not in mat_filtered[index].keys():
            mat_filtered[index][ling_pert] = {}
            mat_filtered[index][ling_pert][asym_pert] = material_sents[index][
                ling_pert][asym_pert]
        else:
            mat_filtered[index][ling_pert][asym_pert] = material_sents[index][
                ling_pert][asym_pert]

    logger.info("finished reading in material data")

    output_df = run_pipeline(model=roberta,
                             tokenizer=tokenizer,
                             fictitious_entities=fictitious_entities,
                             sentences=mat_filtered,
                             config=material_config,
                             number_of_entity_trials=number_of_entity_trials,
                             logger=logger)

    output_df.to_csv(
        "../data/masked_word_result_data/roberta/sample_from_set/material_perf_ft19_new_{}.csv"
        .format(number_of_entity_trials),
        index=False)

    logger.info("finished saving physical material results")

    with open("../data/truism_data/social_data_sentences_2.json", "r") as f:
        social_sents = json.load(f)

    with open("../data/truism_data/social_data_2.json", "r") as f:
        social_config = json.load(f)

    soc_filtered = {}
    for key in test_keys['soc']:
        index = key.split("-")[0]
        ling_pert = key.split("-")[1]
        asym_pert = key.split("-")[2]
        if index not in soc_filtered.keys():
            soc_filtered[index] = {}
            soc_filtered[index][ling_pert] = {}
            soc_filtered[index][ling_pert][asym_pert] = social_sents[index][
                ling_pert][asym_pert]
        elif ling_pert not in soc_filtered[index].keys():
            soc_filtered[index][ling_pert] = {}
            soc_filtered[index][ling_pert][asym_pert] = social_sents[index][
                ling_pert][asym_pert]
        else:
            soc_filtered[index][ling_pert][asym_pert] = social_sents[index][
                ling_pert][asym_pert]

    logger.info("finished reading in social data")

    output_df = run_pipeline(model=roberta,
                             tokenizer=tokenizer,
                             fictitious_entities=fictitious_entities,
                             sentences=soc_filtered,
                             config=social_config,
                             number_of_entity_trials=number_of_entity_trials,
                             logger=logger)

    output_df.to_csv(
        "../data/masked_word_result_data/roberta/sample_from_set/social_perf_ft19_new_{}.csv"
        .format(number_of_entity_trials),
        index=False)

    logger.info("finished saving physical social results")
Example #4
class Roberta(object):
    def __init__(self, args):
        # self.dict_file = "{}/{}".format(args.roberta_model_dir, args.roberta_vocab_name)
        self.tokenizer = RobertaTokenizer.from_pretrained('roberta-base')
        if args.model_path is not None:
            print("Testing CoLAKE...")
            print('loading model parameters from {}...'.format(
                args.model_path))
            config = RobertaConfig.from_pretrained('roberta-base',
                                                   type_vocab_size=3)
            self.model = RobertaForMaskedLM(config=config)
            states_dict = torch.load(os.path.join(args.model_path,
                                                  'model.bin'))
            self.model.load_state_dict(states_dict, strict=False)
        else:
            print("Testing RoBERTa baseline...")
            self.model = RobertaForMaskedLM.from_pretrained('roberta-base')

        self._build_vocab()
        self._init_inverse_vocab()
        self._model_device = 'cpu'
        self.max_sentence_length = args.max_sentence_length

    def _cuda(self):
        self.model.cuda()

    def _build_vocab(self):
        self.vocab = []
        for key in range(len(self.tokenizer)):
            value = self.tokenizer.decode([key])
            if value[0] == " ":  # if the token starts with a whitespace
                value = value.strip()
            else:
                # this is subword information
                value = "_{}_".format(value)

            if value in self.vocab:
                # print("WARNING: token '{}' is already in the vocab".format(value))
                value = "{}_{}".format(value, key)

            self.vocab.append(value)
        print("size of vocabulary: {}".format(len(self.vocab)))

    def _init_inverse_vocab(self):
        self.inverse_vocab = {w: i for i, w in enumerate(self.vocab)}

    def try_cuda(self):
        """Move model to GPU if one is available."""
        if torch.cuda.is_available():
            if self._model_device != 'cuda':
                self._cuda()
                self._model_device = 'cuda'
        else:
            print('No CUDA found')

    def init_indices_for_filter_logprobs(self, vocab_subset):
        index_list = []
        new_vocab_subset = []
        for word in vocab_subset:
            if word in self.inverse_vocab:
                inverse_id = self.inverse_vocab[word]
                index_list.append(inverse_id)
                new_vocab_subset.append(word)
            else:
                msg = "word {} from vocab_subset not in model vocabulary!".format(
                    word)
                print("WARNING: {}".format(msg))

        indices = torch.as_tensor(index_list)
        return indices, index_list

    def filter_logprobs(self, log_probs, indices):
        new_log_probs = log_probs.index_select(dim=2, index=indices)
        return new_log_probs

    def get_id(self, input_string):
        # Roberta predicts ' London' and not 'London'
        string = " " + str(input_string).strip()
        tokens = self.tokenizer.encode(string, add_special_tokens=False)
        # return [element.item() for element in tokens.long().flatten()]
        return tokens

    def get_batch_generation(self, samples_list, try_cuda=True):
        if not samples_list:
            return None
        if try_cuda:
            self.try_cuda()

        tensor_list = []
        masked_indices_list = []
        max_len = 0
        output_tokens_list = []
        seq_len = []
        for sample in samples_list:
            masked_inputs_list = sample["masked_sentences"]

            tokens_list = [self.tokenizer.bos_token_id]

            for idx, masked_input in enumerate(masked_inputs_list):
                tokens_list.extend(
                    self.tokenizer.encode(" " + masked_input.strip(),
                                          add_special_tokens=False))
                tokens_list.append(self.tokenizer.eos_token_id)

            # tokens = torch.cat(tokens_list)[: self.max_sentence_length]
            tokens = torch.tensor(tokens_list)[:self.max_sentence_length]
            output_tokens_list.append(tokens.long().cpu().numpy())

            seq_len.append(len(tokens))
            if len(tokens) > max_len:
                max_len = len(tokens)
            tensor_list.append(tokens)
            masked_index = (
                tokens == self.tokenizer.mask_token_id).nonzero().numpy()
            for x in masked_index:
                masked_indices_list.append([x[0]])
        tokens_list = []
        for tokens in tensor_list:
            pad_length = max_len - len(tokens)
            if pad_length > 0:
                pad_tensor = torch.full([pad_length],
                                        self.tokenizer.pad_token_id,
                                        dtype=torch.int)
                tokens = torch.cat((tokens, pad_tensor.long()))
            tokens_list.append(tokens)

        batch_tokens = torch.stack(tokens_list)
        seq_len = torch.LongTensor(seq_len)
        attn_mask = seq_len_to_mask(seq_len)

        with torch.no_grad():
            # with utils.eval(self.model.model):
            self.model.eval()
            outputs = self.model(
                batch_tokens.long().to(device=self._model_device),
                attention_mask=attn_mask.to(device=self._model_device))
            # without labels, the first output is the prediction scores (logits)
            # over the vocabulary
            log_probs = outputs[0]

        return log_probs.cpu(), output_tokens_list, masked_indices_list
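Finally, a usage sketch for this wrapper. The argument values and the sentence are placeholders, and the class itself assumes that seq_len_to_mask (from the surrounding project's dependencies) plus os, torch, and the transformers classes are already imported.

from types import SimpleNamespace

# Minimal stand-in for the argparse namespace the constructor expects.
args = SimpleNamespace(model_path=None, max_sentence_length=100)
lm = Roberta(args)  # model_path=None selects the plain roberta-base baseline

samples = [{"masked_sentences": ["The capital of France is <mask>."]}]
log_probs, output_tokens, masked_indices = lm.get_batch_generation(
    samples, try_cuda=False)

# log_probs has shape (batch, seq_len, vocab_size); take the most likely token
# id at the first masked position of the first sample.
pos = masked_indices[0][0]
predicted_id = log_probs[0, pos].argmax().item()
print(lm.tokenizer.decode([predicted_id]))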