Example #1
    def __init__(self, batch_size, epoch_num, model_name, is_test):
        self.BATCH_SIZE = batch_size
        self.EPOCHS = epoch_num
        self.NUM_LABELS = 4
        self.model_name = model_name

        if self.model_name == "bert":
            self.model_version = 'bert-base-cased'
            self.tokenizer = BertTokenizer.from_pretrained(self.model_version)
            if is_test:
                self.model = BertForSequenceClassification.from_pretrained(
                    model_name + "_model", num_labels=self.NUM_LABELS)
            else:
                self.model = BertForSequenceClassification.from_pretrained(
                    self.model_version, num_labels=self.NUM_LABELS)
        elif self.model_name == "robert":
            self.model_version = 'roberta-base'
            self.tokenizer = RobertaTokenizer.from_pretrained(
                self.model_version)
            if is_test:
                self.model = RobertaForSequenceClassification.from_pretrained(
                    model_name + "_model", num_labels=self.NUM_LABELS)
            else:
                self.model = RobertaForSequenceClassification.from_pretrained(
                    self.model_version, num_labels=self.NUM_LABELS)
        elif self.model_name == "albert":
            self.model_version = 'albert-base-v2'
            self.tokenizer = AlbertTokenizer.from_pretrained(
                self.model_version)
            if is_test:
                self.model = AlbertForSequenceClassification.from_pretrained(
                    model_name + "_model", num_labels=self.NUM_LABELS)
            else:
                self.model = AlbertForSequenceClassification.from_pretrained(
                    self.model_version, num_labels=self.NUM_LABELS)

        if is_test:
            self.testset = FakeNewsDataset("test", tokenizer=self.tokenizer)
            self.testloader = DataLoader(self.testset,
                                         batch_size=self.BATCH_SIZE,
                                         collate_fn=create_mini_batch)
        else:
            self.trainset = FakeNewsDataset("train", tokenizer=self.tokenizer)
            self.trainloader = DataLoader(self.trainset,
                                          batch_size=self.BATCH_SIZE,
                                          collate_fn=create_mini_batch)
            self.model.train()
            self.optimizer = torch.optim.Adam(self.model.parameters(), lr=1e-5)

        self.device = torch.device(
            "cuda:0" if torch.cuda.is_available() else "cpu")
        self.model.to(self.device)
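FakeNewsDataset and create_mini_batch are defined elsewhere in this project and are not shown here. A minimal sketch of what such a collate function might look like, assuming each dataset item is a (token_ids, segment_ids, label) tuple of tensors; this is an illustration, not the original implementation:

import torch
from torch.nn.utils.rnn import pad_sequence

def create_mini_batch(samples):
    # Each sample is assumed to be a (token_ids, segment_ids, label) tuple of tensors.
    token_ids = [s[0] for s in samples]
    segment_ids = [s[1] for s in samples]
    labels = torch.stack([s[2] for s in samples])

    # Pad every sequence in the batch to the length of the longest one.
    token_ids = pad_sequence(token_ids, batch_first=True)
    segment_ids = pad_sequence(segment_ids, batch_first=True)

    # Attention mask: 1 for real tokens, 0 for padding.
    attention_mask = (token_ids != 0).long()

    return token_ids, segment_ids, attention_mask, labels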
Example #2
def Get_Model(modelName):
    model = ''
    if modelName == 'XLNet':
        model = XLNetForSequenceClassification.from_pretrained(
            # Load the pretrained XLNet weights from the given path.
            pretrained_model_path,
            # The number of output labels--2 for binary classification.
            num_labels=2)
    elif modelName == 'BERT':
        model = BertForSequenceClassification.from_pretrained(
            # Load the pretrained BERT weights from the given path.
            pretrained_model_path,
            # The number of output labels--2 for binary classification.
            num_labels=2)
    elif modelName == 'RoBerta':
        model = RobertaForSequenceClassification.from_pretrained(
            # Load the pretrained RoBERTa weights from the given path.
            pretrained_model_path,
            # The number of output labels--2 for binary classification.
            num_labels=2)
    elif modelName == 'Albert':
        model = AlbertForSequenceClassification.from_pretrained(
            # Load the pretrained ALBERT weights from the given path.
            pretrained_model_path,
            # The number of output labels--2 for binary classification.
            num_labels=2)
    return model
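The four branches of Get_Model differ only in the model class, so the same behaviour can be expressed with a lookup table. A sketch under that assumption; the helper name get_model and MODEL_CLASSES are illustrative, and pretrained_model_path is still assumed to be defined at module level as in the original:

from transformers import (AlbertForSequenceClassification,
                          BertForSequenceClassification,
                          RobertaForSequenceClassification,
                          XLNetForSequenceClassification)

# Illustrative lookup table; keys match the names Get_Model accepts.
MODEL_CLASSES = {
    'XLNet': XLNetForSequenceClassification,
    'BERT': BertForSequenceClassification,
    'RoBerta': RobertaForSequenceClassification,
    'Albert': AlbertForSequenceClassification,
}

def get_model(model_name):
    model_class = MODEL_CLASSES.get(model_name)
    if model_class is None:
        return ''  # same fallback as Get_Model above
    # num_labels=2 for binary classification, as in the original.
    return model_class.from_pretrained(pretrained_model_path, num_labels=2)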
Example #3
def main():

    bert_base_config = BertConfig.from_pretrained('bert-base-uncased', num_labels=2)
    bert_base_model = BertForSequenceClassification.from_pretrained('bert-base-uncased', config=bert_base_config)
    count = 0
    for name, param in bert_base_model.named_parameters():
        if param.requires_grad:
            size = 1
            for s in param.data.size():
                size = s * size
            count += size
    print('The total number of parameters in bert_base_uncased: ', count)

    roberta_config = RobertaConfig.from_pretrained('roberta-base', num_labels=2)
    roberta_model = RobertaForSequenceClassification.from_pretrained('roberta-base',config=roberta_config)
    count = 0
    for name, param in roberta_model.named_parameters():
        if param.requires_grad:
            size = 1
            for s in param.data.size():
                size = s * size
            count += size
    print('The total number of parameters in roberta: ', count)

    albert_config = AlbertConfig.from_pretrained('albert-base-v2', num_labels=2)
    albert_model = AlbertForSequenceClassification.from_pretrained('albert-base-v2', config=albert_config)
    count = 0
    for name, param in albert_model.named_parameters():
        if param.requires_grad:
            size = 1
            for s in param.data.size():
                size = s * size
            count += size
    print('The total number of parameters in albert: ', count)
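The nested loop above multiplies each tensor's dimensions by hand; torch.Tensor.numel() returns the same product, so the per-model count can be collapsed into one line. A sketch; count_trainable_parameters is an illustrative helper name:

def count_trainable_parameters(model):
    # numel() is the product of all tensor dimensions, matching the manual size loop above.
    return sum(p.numel() for p in model.parameters() if p.requires_grad)

# e.g. inside main(): print(count_trainable_parameters(bert_base_model))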
Example #4
 def __init__(self):
     super(AlbertModel, self).__init__()
     self.albert = AlbertForSequenceClassification.from_pretrained(
         "voidful/albert_chinese_base", num_labels=2)  # /bert_pretrain/
     self.device = torch.device("cuda")
     for param in self.albert.parameters():
         param.requires_grad = True  # every parameter requires a gradient
Example #5
    def __init__(self, model_name, model_type):
        """
        Hyper-parameters found with the validation set:
        xlnet-large-cased : epochs = 4,  learning_rate = 1e-5, batch_size = 16, epsilon = 1e-6
        bert-large-uncased : epochs = 4,  learning_rate = 3e-5, batch_size = 16, epsilon = 1e-8
        albert-xxlarge-v2 : epochs = 3,  learning_rate = 5e-5, batch_size = 8, epsilon = 1e-6, to be improved...
        """
        self.model_name = model_name
        self.model_type = model_type

        # Per the transformers library, a batch size of 16 or 32 is advised for training. To limit memory
        # usage we use 16 (8 for ALBERT). Gradient accumulation has not led to a great improvement and
        # therefore won't be used here.
        if model_type == 'albert':
            self.batch_size = 8
        else:
            self.batch_size = 16

        available_model_name = ["xlnet-large-cased", "bert-large-uncased", "albert-xxlarge-v2"]
        available_model_type = ["bert", "xlnet", "albert"]

        if self.model_name not in available_model_name:
            raise Exception("Error : model_name should be in", available_model_name)
        if self.model_type not in available_model_type:
            raise Exception("Error : model_name should be in", available_model_type)

        # Load a *ForSequenceClassification model: the pretrained transformer with a single linear regression layer on top of the pooled output.
        # To load our fine-tuned model instead: e.g. BertForSequenceClassification.from_pretrained('./my_saved_model_directory/')
        if self.model_type == 'bert':
            self.config = BertConfig.from_pretrained(self.model_name, num_labels=1)  # num_labels=1 for regression task
            self.model = BertForSequenceClassification.from_pretrained(self.model_name, config=self.config)
        elif self.model_type == 'xlnet':
            self.config = XLNetConfig.from_pretrained(self.model_name, num_labels=1)
            self.model = XLNetForSequenceClassification.from_pretrained(self.model_name, config=self.config)
        elif self.model_type == 'albert':
            self.config = AlbertConfig.from_pretrained(self.model_name, num_labels=1)
            self.model = AlbertForSequenceClassification.from_pretrained(self.model_name, config=self.config)
        self.model.cuda()

        if self.model_name == 'xlnet-large-cased':
            self.epochs = 4
            self.lr = 1e-5
            self.eps = 1e-6

        elif self.model_name == 'bert-large-uncased':
            self.epochs = 4
            self.lr = 3e-5
            self.eps = 1e-8

        elif self.model_name == 'albert-xxlarge-v2':
            self.epochs = 3
            self.lr = 5e-5
            self.eps = 1e-6

        self.max_grad_norm = 1.0  # Gradient threshold: gradient norms that exceed this value are scaled down to match it.

        self.optimizer = AdamW(self.model.parameters(), lr=self.lr, eps=self.eps)
        self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
        self.n_gpu = torch.cuda.device_count()
        torch.cuda.get_device_name(0)
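The training loop itself is not part of this snippet. A minimal sketch of how the optimizer and max_grad_norm defined above typically fit into one training step, assuming batches of input_ids, attention_mask and labels and the older tuple-returning transformers API:

import torch

def training_step(model, optimizer, batch, max_grad_norm=1.0):
    # One step: forward pass, backward pass, gradient clipping, parameter update.
    model.zero_grad()
    outputs = model(**batch)   # batch is assumed to hold input_ids, attention_mask, labels
    loss = outputs[0]          # older transformers versions return a (loss, logits, ...) tuple
    loss.backward()
    # Scale down gradient norms that exceed max_grad_norm, as the comment above describes.
    torch.nn.utils.clip_grad_norm_(model.parameters(), max_grad_norm)
    optimizer.step()
    return loss.item()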
Example #6
 def create_and_check_albert_for_sequence_classification(
         self, config, input_ids, token_type_ids, input_mask,
         sequence_labels, token_labels, choice_labels):
     config.num_labels = self.num_labels
     model = AlbertForSequenceClassification(config)
     model.eval()
     loss, logits = model(input_ids,
                          attention_mask=input_mask,
                          token_type_ids=token_type_ids,
                          labels=sequence_labels)
     result = {
         "loss": loss,
         "logits": logits,
     }
     self.parent.assertListEqual(list(result["logits"].size()),
                                 [self.batch_size, self.num_labels])
     self.check_loss_output(result)
Example #7
 def __init__(self, path='model', model_type='albert-base-v2'):
     self.path = path
     self.model_type = model_type
     self.tokenizer = AlbertTokenizer.from_pretrained(self.model_type, do_lower_case=True)
     self.model = AlbertForSequenceClassification.from_pretrained(self.path)
     self.device = "cpu"
     self.model.to(self.device)
     self.model.eval()
Example #8
def albert_trainer():
    # load the dataset and metric
    dataset = load_dataset("glue", 'mnli')
    metric = load_metric('glue', 'mnli')

    # load the tokenizer
    tokenizer = AutoTokenizer.from_pretrained('albert-base-v2', use_fast=True)

    # define a preprocessing function
    def preprocess_function(examples):
        return tokenizer(examples["premise"],
                         examples["hypothesis"],
                         truncation=True)

    # preprocess the data
    encoded_dataset = dataset.map(preprocess_function, batched=True)

    # load the model
    model = AlbertForSequenceClassification.from_pretrained("albert-base-v2",
                                                            num_labels=3)

    # set all the training parameters
    batch_size = 16
    args = TrainingArguments(
        "test-glue",
        evaluation_strategy="epoch",
        learning_rate=2e-5,
        per_device_train_batch_size=batch_size,
        per_device_eval_batch_size=batch_size,
        num_train_epochs=5,
        weight_decay=0.01,
        load_best_model_at_end=True,
        metric_for_best_model='accuracy',
    )

    # define a metric function
    def compute_metrics(eval_pred):
        predictions, labels = eval_pred
        predictions = np.argmax(predictions, axis=1)

        return metric.compute(predictions=predictions, references=labels)

    # initialize trainer
    trainer = Trainer(model,
                      args,
                      train_dataset=encoded_dataset["train"],
                      eval_dataset=encoded_dataset['validation_matched'],
                      tokenizer=tokenizer,
                      compute_metrics=compute_metrics)

    # train
    trainer.train()

    # evaluate
    result = trainer.evaluate()

    # print the result
    print(result)
Example #9
def predict(load_path,
            file_path='./data/Task2.predict.csv',
            save_path='./data/Task2.predict.result.csv',
            **kwargs):
    with open(os.path.join(load_path, 'config.pkl'), 'rb') as f:
        config = pickle.load(f)

    config.update(**kwargs)
    config.load_path = load_path

    if config.device == 'cuda' and torch.cuda.is_available():
        torch.cuda.set_device(config.gpu)
    else:
        config.device = 'cpu'

    data_texts = read_prediction_data(file_path)

    if len(data_texts) == 0:
        save_prediction_result([], file_path, save_path)
        return

    # Load Bert Tokenizer
    if config.bert_model == 'bert':
        tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
    elif config.bert_model == 'albert':
        tokenizer = AlbertTokenizer.from_pretrained('albert-base-v1')
    else:
        raise Exception('Error Bert model.')

    data_encodings = tokenizer(data_texts, truncation=True, padding=True)
    data_dataset = LicenseDataset(data_encodings)
    data_loader = DataLoader(data_dataset,
                             batch_size=config.batch_size,
                             shuffle=False)

    if config.bert_model == 'bert':
        model = BertForSequenceClassification.from_pretrained(
            'bert-base-uncased')
    elif config.bert_model == 'albert':
        model = AlbertForSequenceClassification.from_pretrained(
            'albert-base-v1')
    else:
        raise Exception('Error Bert model.')

    if config.load_path:
        model = load_model(model, path=config.load_path, name=config.ckpt_name)
    model.to(config.device)
    model.eval()

    pred = []
    for i, batch in enumerate(data_loader):
        input_ids = batch['input_ids'].to(config.device)
        attention_mask = batch['attention_mask'].to(config.device)
        outputs = model(input_ids, attention_mask=attention_mask)
        logits = outputs[0]
        pred.extend(torch.argmax(logits, dim=1).tolist())

    save_prediction_result(pred, file_path, save_path)
Example #10
 def __init__(self, path='output', model_type='albert-base-v2'):
     self.path = path
     self.model_type = model_type
     self.tokenizer = AlbertTokenizer.from_pretrained(self.model_type,
                                                      do_lower_case=True)
     self.model = AlbertForSequenceClassification.from_pretrained(self.path)
     self.device = "cuda" if torch.cuda.is_available() else "cpu"
     self.model.to(self.device)
     self.model.eval()
Example #11
 def __init__(self, requires_grad=True):
     super(AlbertModel, self).__init__()
     self.albert = AlbertForSequenceClassification.from_pretrained(
         'albert-xxlarge-v2', num_labels=2)
     self.tokenizer = AutoTokenizer.from_pretrained('albert-xxlarge-v2',
                                                    do_lower_case=True)
     self.requires_grad = requires_grad
     self.device = torch.device("cuda")
     for param in self.albert.parameters():
         param.requires_grad = True  # Each parameter requires gradient
Example #12
def apply_rules(dataset_path, model_path):
    data = np.load(dataset_path, allow_pickle=True)

    test_data = [separate_answers(x[0]) for x in data if int(x[1]) == 0]
    top_rules = np.load("final_rules.npy", allow_pickle=True)
    tr2 = replace_rules.TextToReplaceRules(nlp, [x[1] for x in test_data], [], min_freq=0.005,
                                           min_flip=0.005, ngram_size=2)
    # Own model
    model = AlbertForSequenceClassification.from_pretrained(pretrained_weights, num_labels=3)
    model.load_state_dict(torch.load(model_path))
    model.cuda()
    model.eval()

    tokenized_stud_ans = tokenizer.tokenize([x[1] for x in test_data])
    model_preds = {}
    rule_flip_amount = {}
    data_id_flipped = {}
    a = time.time()
    for rule in top_rules:
        idxs = list(tr2.get_rule_idxs(rule))
        to_apply = [tokenized_stud_ans[x] for x in idxs]
        applies, nt = rule.apply_to_texts(to_apply, fix_apostrophe=False)
        # Find indices, where rule has been applied
        applies = [idxs[x] for x in applies]
        to_compute = [x for x in zip(applies, nt) if x[1] not in model_preds]
        if to_compute:
            # New predicts
            new_labels = []
            for compute in to_compute:
                j, new_stud = compute
                # Get reference answer for sequence classification
                orig_instance = test_data[j]
                logits = predict(model, orig_instance[0], new_stud, 0)
                new_label = int(np.argmax(logits))
                new_labels.append(new_label)
            for x, y in zip(to_compute, new_labels):
                model_preds[x[1]] = y

        new_labels = np.array([model_preds[x] for x in nt])
        where_flipped = np.where(new_labels == 2)[0]
        flips = sorted([applies[x] for x in where_flipped])
        rule_flip_amount[rule.hash()] = len(flips)
        data_id_flipped[rule.hash()] = list(where_flipped)

        #print("Done with " + rule.hash())

    # Top 10 rules
    top_10 = [x.replace("text_", "").replace("pos_", "") for x in
              list({k: v for k, v in sorted(rule_flip_amount.items(), key=lambda item: item[1], reverse=True)})[:10]]
    np.save(model_path[:model_path.rfind("/") + 1] + "top_10.npy", top_10)
    print("Time used for applying rules: ", time.time() - a)
    print("Total amount of adversaries:", sum(list(rule_flip_amount.values())))
    print("Total amount of afflicted data instances:",
          len(set(np.concatenate(list(data_id_flipped.values())).ravel().tolist())))
Example #13
    def initialize(self, ctx):
        self.manifest = ctx.manifest
        properties = ctx.system_properties
        model_dir = properties.get("model_dir")
        serialized_file = self.manifest['model']['serializedFile']
        model_pt_path = os.path.join(model_dir, serialized_file)
        self.device = torch.device("cuda:" + str(properties.get("gpu_id")) if torch.cuda.is_available() else "cpu")
        # read configs for the mode, model_name, etc. from setup_config.json
        setup_config_path = os.path.join(model_dir, "setup_config.json")
        if os.path.isfile(setup_config_path):
            with open(setup_config_path) as setup_config_file:
                self.setup_config = json.load(setup_config_file)
        else:
            logger.warning('Missing the setup_config.json file.')

        # Loading the model and tokenizer from checkpoint and config files based on the user's choice of mode
        # further setup config can be added.
        if self.setup_config["save_mode"] == "torchscript":
            self.model = torch.jit.load(model_pt_path)
        elif self.setup_config["save_mode"] == "pretrained":
            if self.setup_config["mode"] == "sequence_classification":
                self.model = AlbertForSequenceClassification.from_pretrained(model_dir)
            # elif self.setup_config["mode"]== "question_answering":
            #     self.model = AutoModelForQuestionAnswering.from_pretrained(model_dir)
            # elif self.setup_config["mode"]== "token_classification":
            #     self.model = AutoModelForTokenClassification.from_pretrained(model_dir)
            else:
                logger.warning('Missing the operation mode.')
        else:
            logger.warning('Missing the checkpoint or state_dict.')

        # if not os.path.isfile(os.path.join(model_dir, "vocab.*")):
        #     self.tokenizer = BertTokenizer.from_pretrained(self.setup_config["model_name"],
        #                                                    do_lower_case=self.setup_config["do_lower_case"])
        # else:
        self.tokenizer = BertTokenizer.from_pretrained(model_dir, do_lower_case=self.setup_config["do_lower_case"])

        self.model.to(self.device)
        self.model.eval()

        logger.debug('Transformer model from path {0} loaded successfully'.format(model_dir))

        # Read the mapping file, index to object name
        mapping_file_path = os.path.join(model_dir, "index_to_name.json")
        # Question answering does not need the index_to_name.json file.
        if not self.setup_config["mode"] == "question_answering":
            if os.path.isfile(mapping_file_path):
                with open(mapping_file_path) as f:
                    self.mapping = json.load(f)
            else:
                logger.warning('Missing the index_to_name.json file.')

        self.initialized = True
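The handler reads save_mode, mode, model_name and do_lower_case from setup_config.json. A sketch of how such a file could be produced for this sequence-classification setup; the concrete values are assumptions, only the keys are taken from the code above:

import json

# Hypothetical setup_config.json with the keys the handler reads.
setup_config = {
    "model_name": "albert-base-v2",
    "mode": "sequence_classification",
    "save_mode": "pretrained",
    "do_lower_case": True,
}
with open("setup_config.json", "w") as f:
    json.dump(setup_config, f, indent=2)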
Example #14
 def __init__(self, requires_grad=True, num_labels=2):
     super(AlbertModel, self).__init__()
     self.num_labels = num_labels
     self.albert = AlbertForSequenceClassification.from_pretrained(
         'voidful/albert_chinese_base', num_labels=self.num_labels)
     self.tokenizer = BertTokenizer.from_pretrained(
         'voidful/albert_chinese_base', do_lower_case=True)
     # self.albert = AlbertForSequenceClassification.from_pretrained('albert-xxlarge-v2', num_labels = self.num_labels)
     # self.tokenizer = AutoTokenizer.from_pretrained('albert-xxlarge-v2', do_lower_case=True)
     self.requires_grad = requires_grad
     self.device = torch.device("cuda")
     for param in self.albert.parameters():
         param.requires_grad = True  # every parameter requires a gradient
Example #15
def pick_model(model_name, num_labels):
    """
        Return the specified model.
        Available model names:
        ['albert-base-v2', 'bert-base-uncased', 'bert-large-uncased',
         'roberta-base', 'xlnet-base-cased']
    """
    if model_name == 'albert-base-v2':
        model = AlbertForSequenceClassification.from_pretrained(
            model_name,
            num_labels=num_labels,
            output_attentions=
            False,  # Whether the model returns attentions weights.
            output_hidden_states=
            False,  # Whether the model returns all hidden-states.
        )
    if model_name in ('bert-base-uncased', 'bert-large-uncased'):
        model = BertForSequenceClassification.from_pretrained(
            model_name,
            num_labels=num_labels,
            output_attentions=
            False,  # Whether the model returns attentions weights.
            output_hidden_states=
            False,  # Whether the model returns all hidden-states.
        )
    if model_name in ('roberta-base', "roberta-large", "roberta-large-mnli"):
        model = RobertaForSequenceClassification.from_pretrained(
            model_name,
            num_labels=num_labels,
            output_attentions=
            False,  # Whether the model returns attentions weights.
            output_hidden_states=
            False,  # Whether the model returns all hidden-states.
        )
    if model_name == 'xlnet-base-cased':
        model = XLNetForSequenceClassification.from_pretrained(
            model_name,
            num_labels=num_labels,
            output_attentions=
            False,  # Whether the model returns attentions weights.
            output_hidden_states=
            False,  # Whether the model returns all hidden-states.
        )

    print(f'Loaded {model_name} model.')
    if torch.cuda.is_available():
        model.cuda()

    return model
Example #16
 def create_and_check_for_sequence_classification(
     self, config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels
 ):
     config.num_labels = self.num_labels
     model = AlbertForSequenceClassification(config)
     model.to(torch_device)
     model.eval()
     result = model(input_ids, attention_mask=input_mask, token_type_ids=token_type_ids, labels=sequence_labels)
     self.parent.assertEqual(result.logits.shape, (self.batch_size, self.num_labels))
Example #17
def define_model_albert(cuda, epochs_steps, learn, length):
    from transformers import AlbertForSequenceClassification
    model = AlbertForSequenceClassification.from_pretrained('albert-base-v2')
    if cuda == 1:
        model.cuda()
    optimizer = AdamW(model.parameters(), lr=learn, eps=1e-8)
    from transformers import get_linear_schedule_with_warmup
    # Number of training epochs (authors recommend between 2 and 4)
    # Total number of training steps is number of batches * number of epochs.
    total_steps = length * epochs_steps
    # Create the learning rate scheduler.
    scheduler = get_linear_schedule_with_warmup(
        optimizer,
        num_warmup_steps=0,  # Default value in run_glue.py
        num_training_steps=total_steps)
    return model, scheduler, optimizer
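A hedged usage sketch: per the comments above, length is the number of training batches per epoch, so total_steps covers the whole schedule. train_dataloader and the hyperparameter values below are illustrative, not from the original:

epochs = 3
model, scheduler, optimizer = define_model_albert(cuda=1,
                                                  epochs_steps=epochs,
                                                  learn=2e-5,
                                                  length=len(train_dataloader))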
Example #18
def model_setting(model_name):
    if model_name == 'bert':
        from transformers import AutoTokenizer, BertForSequenceClassification, BertConfig
        config = BertConfig.from_pretrained("bert-base-uncased", num_labels=2)
        tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")
        model = BertForSequenceClassification.from_pretrained(
            "bert-base-uncased")
        return config, tokenizer, model

    elif model_name == 'albert':
        from transformers import AutoTokenizer, AlbertForSequenceClassification, AlbertConfig
        config = AlbertConfig.from_pretrained("albert-base-v2", num_labels=2)
        tokenizer = AutoTokenizer.from_pretrained("albert-base-v2")
        model = AlbertForSequenceClassification.from_pretrained(
            "albert-base-v2")
        return config, tokenizer, model
Example #19
def main(model_name_or_path, args):
    config_class, model_class, tokenizer_class = (AlbertConfig, AlbertModel, AlbertTokenizer)

    config = load_from_file_or_base(config_class, model_name_or_path)
    model = load_from_file_or_base(model_class, model_name_or_path)
    tokenizer = load_from_file_or_base(tokenizer_class, model_name_or_path)

    if args.model_embedding_visual:
        wordembeddings = model.embeddings.word_embeddings
        # wordembeddings = model.albert.get_input_embeddings()
        logger.info(wordembeddings)

        vocab_word2id = tokenizer.get_vocab()
        word_lookup = {id: key for key, id in vocab_word2id.items()}

        n_samples = 1000
        sampled_ids = torch.randint(
            high=len(vocab_word2id.values()), size=(1, n_samples), dtype=torch.long).flatten()
        logger.info("Sampled ids shape", sampled_ids.shape)
        embedded_vectors = wordembeddings(sampled_ids)

        from sklearn.manifold import TSNE
        embedded_reduced = TSNE(n_components=2).fit_transform(embedded_vectors.detach().numpy())
        logger.info(embedded_reduced)

        names = []
        for sid in sampled_ids:
            names.append(word_lookup[int(sid.int())])
        plot_reduced_space(embedded_reduced, names=names)
    elif args.model_sentence_order:
        text_a = "It is absolutely necessary that all experiments should be recorded in detail during, " \
                 "or immediately after, their performance ..."
        text_b = "The more scientific the record is, the better."
        # Sadly, the SOP task is not implemented in Huggingface Transformers at the moment:
        # https://github.com/huggingface/transformers/issues/2671
        # But there is a workaround using BERT's NSP head, which in the wild is mostly used
        # for sentence similarity queries.
        # The following issue nicely illustrates how to do that:
        # https://github.com/huggingface/transformers/issues/876
        # If we had fine-tuned this model, one could query it with the following code,
        # but without fine-tuning you'll always get random results:
        model = AlbertForSequenceClassification.from_pretrained(model_name_or_path)
        tokenizer = AlbertTokenizer.from_pretrained(model_name_or_path)
        inputs = tokenizer.encode_plus(text_a, text_b, add_special_tokens=True, return_tensors='pt')
        pred = model(inputs['input_ids'], token_type_ids=inputs['token_type_ids'])
        print(pred)
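pred above holds raw classification logits (as the first element of a tuple with this older transformers API); a softmax turns them into class probabilities. A sketch continuing from pred, keeping in mind the comment that the scores are random without fine-tuning:

import torch.nn.functional as F

logits = pred[0]                   # shape (1, num_labels)
probs = F.softmax(logits, dim=-1)  # probabilities over the two classes
print(probs)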
Example #20
def model_fn(model_path='model_path'):

    dout = 0.1
    model = AlbertForSequenceClassification.from_pretrained(model_type,
                                                            num_labels=2,
                                                            output_attentions=False,
                                                            output_hidden_states=False,
                                                            attention_probs_dropout_prob=dout,
                                                            hidden_dropout_prob=dout)
    model.to(DEVICE)
    if DEVICE == 'cuda':
        model.load_state_dict(torch.load(model_path))
    else:
        model.load_state_dict(torch.load(model_path, map_location=torch.device('cpu')))

    #model.eval()
    return model
Example #21
def main():

    # Initialize a pre-trained ALBERT-base tokenizer
    tokenizer = AlbertTokenizer.from_pretrained('albert-base-v2')

    # Initialize data iterators
    train_generator = SingleSentenceClassificationProcessor()
    train_generator.add_examples_from_csv(file_name='data/train.tsv',
                                          column_label=1,
                                          column_text=0)
    train_dataset = train_generator.get_features(
        tokenizer=tokenizer)  #, return_tensors='pt')

    eval_generator = SingleSentenceClassificationProcessor()
    eval_generator.add_examples_from_csv(file_name='data/dev.tsv',
                                         column_label=1,
                                         column_text=0)
    eval_dataset = eval_generator.get_features(
        tokenizer=tokenizer)  #, return_tensors='pt')

    test_generator = SingleSentenceClassificationProcessor()
    test_generator.add_examples_from_csv(file_name='data/test.tsv',
                                         column_label=1,
                                         column_text=0)
    test_dataset = test_generator.get_features(
        tokenizer=tokenizer)  #, return_tensors='pt')

    model = AlbertForSequenceClassification.from_pretrained('albert-base-v2')

    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=train_dataset,
        eval_dataset=eval_dataset,
    )

    trainer.train()

    test_batch = next(iter(test_dataset))
    print(f'Test batch is {test_batch}')
    pred = model(
        torch.tensor(test_batch.input_ids).unsqueeze(0).cuda(),
        labels=torch.tensor(test_batch.label).unsqueeze(0).cuda())
    print(f'Prediction: {pred}')
Example #22
def sentiment_analysis(model_type, data_path):
    if model_type == 'albert':
        model = AlbertForSequenceClassification.from_pretrained(
            "textattack/albert-base-v2-SST-2")
        tokenizer = AlbertTokenizer.from_pretrained(
            "textattack/albert-base-v2-SST-2")

    elif model_type == 'bert':
        model = BertForSequenceClassification.from_pretrained(
            "textattack/bert-base-uncased-SST-2")
        tokenizer = BertTokenizer.from_pretrained(
            "textattack/bert-base-uncased-SST-2")

    elif model_type == 'distil':
        model = DistilBertForSequenceClassification.from_pretrained(
            "textattack/distilbert-base-cased-SST-2")
        tokenizer = DistilBertTokenizer.from_pretrained(
            "textattack/distilbert-base-cased-SST-2")

    elif model_type == 'roberta':
        model = RobertaForSequenceClassification.from_pretrained(
            "textattack/roberta-base-SST-2")
        tokenizer = RobertaTokenizer.from_pretrained(
            "textattack/roberta-base-SST-2")

    nlp = pipeline("sentiment-analysis", model=model, tokenizer=tokenizer)
    tl = TextLoader(data_path)
    ground_truth = list()
    review_predictions = list()
    label_dict = {'LABEL_0': 0, 'LABEL_1': 1}

    for data in tqdm(tl):
        text = data['text']
        score = data['score']
        score = score >= 2.5
        result = nlp(text, truncation=True)
        prediction = label_dict[result[0]['label']]
        ground_truth.append(score.cpu().numpy())
        review_predictions.append(prediction)

    accuracy = accuracy_score(ground_truth, review_predictions)
    print('ACCURACY: ', accuracy)

    return accuracy
Example #23
    def __init__(self, hyperparams):
        """
        :param hyperparams: dict of hyperparameters
        :type hyperparams: dict

        pretrained_weights in ['albert-base-v1',
                               'albert-base-v2']
        more in https://huggingface.co/transformers/pretrained_models.html
        """
        set_seed(hyperparams["random_state"], hyperparams["n_gpu"])

        pretrained_weights = hyperparams['pretrained_weights']
        self.tokenizer = AlbertTokenizer.from_pretrained(pretrained_weights)

        hyperparams["tokenizer"] = self.tokenizer
        self.hyperparams = hyperparams
        self.model = AlbertForSequenceClassification.from_pretrained(
            pretrained_weights, num_labels=3)
        self.processor = NLIProcessor(hyperparams)
Example #24
 def __call_model_torch(self):
     if self.model_to_use.lower() == 'bert':
         self.config = BertConfig(num_labels=2)
         self.model = BertForSequenceClassification.from_pretrained(
             'bert-base-uncased', config=self.config)
     elif self.model_to_use.lower() == 'albert':
         self.config = AlbertConfig(num_labels=2)
         self.model = AlbertForSequenceClassification.from_pretrained(
             'albert-base-v1', config=self.config)
     elif self.model_to_use.lower() == 'electra':
         self.config = ElectraConfig(num_labels=2)
         self.model = ElectraForSequenceClassification.from_pretrained(
             'google/electra-small-discriminator', config=self.config)
     elif self.model_to_use.lower() == 'distilbert':
         self.config = DistilBertConfig(num_labels=2)
         self.model = DistilBertForSequenceClassification.from_pretrained(
             'distilbert-base-uncased', config=self.config)
     else:
         print('Model not available yet.')
Example #25
    def __init__(self, num_classes, max_seq_length, batch_size, model_name,
                 model_path):

        self.num_classes = num_classes
        self.classification_model_dir = model_path
        self.max_seq_length = max_seq_length
        self.predict_batch_size = batch_size
        self.model_name = model_name

        if torch.cuda.is_available():
            self.device = torch.device("cuda")
        else:
            print('No GPU available, using the CPU instead.')
            self.device = torch.device("cpu")

        if self.model_name == 'bert':
            self.model = BertForSequenceClassification.from_pretrained(
                self.classification_model_dir, num_labels=self.num_classes)
            self.tokenizer = BertTokenizer.from_pretrained(
                self.classification_model_dir)

        if self.model_name == 'albert':
            self.model = AlbertForSequenceClassification.from_pretrained(
                self.classification_model_dir, num_labels=self.num_classes)
            self.tokenizer = AlbertTokenizer.from_pretrained(
                self.classification_model_dir)

        if self.model_name == 'distilbert':
            self.model = DistilBertForSequenceClassification.from_pretrained(
                self.classification_model_dir, num_labels=self.num_classes)
            self.tokenizer = DistilBertTokenizer.from_pretrained(
                self.classification_model_dir)

        if self.model_name == 'roberta':
            self.model = RobertaForSequenceClassification.from_pretrained(
                self.classification_model_dir, num_labels=self.num_classes)
            self.tokenizer = RobertaTokenizer.from_pretrained(
                self.classification_model_dir)

        if torch.cuda.is_available():
            self.model.cuda()
Example #26
def main():
    #
    blockPrint()

    # setting device
    device = torch.device('cuda')

    #
    FullData = MR_Data.load_data('dataset/test.tsv', is_train_data=False)
    FullDataset = makeTorchDataSet(FullData, is_train_data=False)
    TestDataLoader = makeTorchDataLoader(FullDataset, batch_size=16)
    model_config = AlbertConfig.from_json_file(
        'model/albert-large-config.json')
    trained_model_file = '12-11-2019_09-17-05_ALSS_e5_a69.24226892192033'
    model = AlbertForSequenceClassification.from_pretrained(
        'train_models/' + trained_model_file + '/pytorch_model.bin',
        config=model_config)

    model.to(device)
    model.eval()

    f = open('submission.csv', 'w', encoding='utf-8')
    f.write('PhraseId,Sentiment\n')
    log("please waiting for predict ....")
    for batch_index, batch_dict in enumerate(TestDataLoader):
        batch_dict = tuple(t.to(device) for t in batch_dict)
        input_ids, phrase_ids = batch_dict
        outputs = model(input_ids)

        outputs = outputs[0].cpu()
        outputs = outputs.detach().numpy()
        # log(outputs)

        for i in range(len(outputs)):
            p_id = phrase_ids[i].item()
            s_level = np.argmax(outputs[i])
            # log("phrase_id",p_id,"segment_level",s_level)
            f.write(str(p_id) + ',' + str(s_level) + '\n')

    f.close()
Example #27
 def call(self):
     if self.model_to_use.lower() == 'bert':
         self.model = BertForSequenceClassification.from_pretrained(
             'bert-base-uncased',
             num_labels=2,
             output_attentions=False,
             output_hidden_states=False)
         print('Bert loaded.')
         print(self.model)
     elif self.model_to_use.lower() == 'albert':
         self.model = AlbertForSequenceClassification.from_pretrained(
             'albert-base-v1',
             num_labels=2,
             output_attentions=False,
             output_hidden_states=False)
     elif self.model_to_use.lower() == 'electra':
         self.model = ElectraForSequenceClassification.from_pretrained(
             'google/electra-small-discriminator',
             num_labels=2,
             output_attentions=False,
             output_hidden_states=False)
     elif self.model_to_use.lower() == 'distilbert':
         self.model = DistilBertForSequenceClassification.from_pretrained(
             'distilbert-base-uncased',
             num_labels=2,
             output_attentions=False,
             output_hidden_states=False)
     else:
         print('Model not available right now.')
     self.model.to(self.device)
     self.optimizer = AdamW(self.model.parameters(),
                            lr=self.learning_rate,
                            eps=self.epsilon)
     self.total_steps = len(self.train_dataloader) * self.epochs
     self.scheduler = get_linear_schedule_with_warmup(
         self.optimizer,
         num_warmup_steps=0,
         num_training_steps=self.total_steps)
Example #28
def evaluate(loader, model_dir, ckpt, num_labels):
    loss = 0.0
    nb_eval_steps = 0
    y_pred = None
    model = AlbertForSequenceClassification.from_pretrained(
        model_dir, num_labels=num_labels)
    model = nn.DataParallel(model)
    model.load_state_dict(torch.load(os.path.join(model_dir, ckpt)))
    model = model.cuda()

    for batch in tqdm(loader, desc="Evaluating"):
        model.eval()
        inp_ids, seg_ids, inp_masks, labels = batch
        inp_ids = inp_ids.cuda()
        seg_ids = seg_ids.cuda()
        inp_masks = inp_masks.cuda()
        labels = labels.cuda()
        with torch.no_grad():
            tmp_loss, logits = model(inp_ids,
                                     seg_ids,
                                     inp_masks,
                                     labels=labels)
            loss += tmp_loss.mean().item()
            nb_eval_steps += 1

            if y_pred is None:
                y_pred = np.argmax(logits.detach().cpu().numpy(), axis=1)
                y_true = labels.detach().cpu().numpy()
            else:
                y_pred = np.append(
                    y_pred, np.argmax(logits.detach().cpu().numpy(), axis=1))
                y_true = np.append(y_true, labels.detach().cpu().numpy())

    loss = loss / nb_eval_steps
    acc = precision_score(y_true, y_pred, average='weighted')  # weighted precision, reported below as test_acc
    print(f"test_acc: {acc}\ttest_loss: {loss}")
    return acc, loss
Example #29
def get_sim_model(config_file, pre_train_model, label_num=2):
    albert_config = AlbertConfig.from_pretrained(config_file)
    albert_config.num_labels = label_num
    model = AlbertForSequenceClassification(albert_config)
    model.load_state_dict(torch.load(pre_train_model))
    return model
Example #30
def main(args):
    tokenizer = AlbertTokenizer.from_pretrained('albert-base-v2')
    assert args.model in ["albert-classifier", "attn-lstm", "cond-attn-lstm"]
    if args.model == "albert-classifier":
        train_dataloader, validation_dataloader = get_dataloader_ALBERT(
            tokenizer, args.data_file, args.batch, args.max_len)
        classifier = AlbertForSequenceClassification.from_pretrained(
            'albert-base-v2')
        classifier.config.classifier_dropout_prob = 0.1
        no_decay = ['bias', 'LayerNorm.weight']

        optimizer_grouped_parameters = [
            # Filter for all parameters whose names *don't* include 'bias' or 'LayerNorm.weight'.
            {
                'params': [
                    p for n, p in classifier.named_parameters()
                    if not any(nd in n for nd in no_decay)
                ],
                'weight_decay_rate':
                args.wd
            },

            # Filter for parameters which *do* include those.
            {
                'params': [
                    p for n, p in classifier.named_parameters()
                    if any(nd in n for nd in no_decay)
                ],
                'weight_decay_rate':
                0.0
            }
        ]

        optimizer = AdamW(optimizer_grouped_parameters,
                          lr=args.lr,
                          eps=args.adam_eps)
        # optimizer = AdamW(classifier.parameters(), lr = LR, eps = EPS)
        total_steps = len(train_dataloader) * args.epochs
        scheduler = get_linear_schedule_with_warmup(
            optimizer,
            num_warmup_steps=0,  # Default value in run_glue.py
            num_training_steps=total_steps)
        # Weight decay and clip_grad_norm; with 10K data: LR = 1e-5, WD = 1e-4, BATCH = 32 by setting accumulate = 2
        classifier, history = AlbertTrainer(classifier,
                                            optimizer,
                                            scheduler,
                                            args.epochs,
                                            args.early_stop,
                                            train_dataloader,
                                            validation_dataloader,
                                            accumulation_steps=args.accumulate)

    else:
        if args.model == "attn-lstm":
            train_dataloader, validation_dataloader = get_dataloader_LSTM(
                tokenizer, args.data_file, args.batch, args.max_len)
            classifier = PairAttnLSTM(embedding_dim=768,
                                      hidden_dim=args.d_hid,
                                      num_layers=args.n_layer,
                                      label_size=args.n_label)
            optimizer_grouped_parameters = [{
                'params': [p for n, p in classifier.named_parameters()],
                'weight_decay_rate':
                args.wd
            }]

            optimizer = AdamW(classifier.parameters(),
                              lr=args.lr,
                              eps=args.adam_eps)
        else:
            pass

        total_steps = len(train_dataloader) * args.epochs

        scheduler = get_linear_schedule_with_warmup(
            optimizer,
            num_warmup_steps=0,  # Default value in run_glue.py
            num_training_steps=total_steps)
        print("Start training ...")
        print("Max epochs", args.epochs)
        print("Early Stop", args.early_stop)
        print("Batch Size", args.batch)
        print("Accumulate", args.accummulate)
        print("Learning Rate", args.lr)
        print("Weight Decay", args.wd)
        print("Max Sequene Length", args.max_len)
        print("LSTM Hidden Size", args.d_hid)
        print("LSTM Layers", args.n_layer)
        print()
        classifier, history = LSTMTrainer(classifier,
                                          optimizer,
                                          scheduler,
                                          args.epochs,
                                          args.early_stop,
                                          train_dataloader,
                                          validation_dataloader,
                                          accumulation_steps=args.accumulate)