Code example #1
 def __init__(self,
              bert_dir: Optional[str],
              pad_token_id: int,
              cls_token_id: int,
              sep_token_id: int,
              num_labels: int,
              max_length: int = 512,
              use_half_precision=False,
              config: Optional[PretrainedConfig] = None):
     super(BertClassifier, self).__init__()
     if bert_dir is None:
         assert config is not None
         assert config.num_labels == num_labels
         bert = RobertaForSequenceClassification(config)
         #bert = BertForSequenceClassification(config)
     else:
         bert = RobertaForSequenceClassification.from_pretrained(
             bert_dir, num_labels=num_labels)
         #bert = BertForSequenceClassification.from_pretrained(bert_dir, num_labels=num_labels)
     if use_half_precision:
         import apex
         bert = bert.half()
     self.bert = bert
     self.pad_token_id = pad_token_id
     self.cls_token_id = cls_token_id
     self.sep_token_id = sep_token_id
     self.max_length = max_length
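
A minimal instantiation sketch for the class above. It assumes the special-token ids come from a roberta-base tokenizer and that num_labels=2 fits the task; only the constructor arguments are taken from the example, everything else is illustrative.

# Hypothetical usage: pull the special-token ids from a tokenizer instead of hard-coding them.
from transformers import RobertaTokenizer

tokenizer = RobertaTokenizer.from_pretrained('roberta-base')
classifier = BertClassifier(
    bert_dir='roberta-base',
    pad_token_id=tokenizer.pad_token_id,
    cls_token_id=tokenizer.cls_token_id,
    sep_token_id=tokenizer.sep_token_id,
    num_labels=2,
    max_length=512)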
Code example #2
    def __init__(self, args):
        super().__init__()
        self.args = args
        self.uniform_prior = args.uniform_prior
        self.entropy_regularize_prior_wt = args.entropy_regularize_prior_wt
        self.use_structured_prior = args.use_structured_prior
        self.use_structured_prior_binarypotential = args.use_structured_prior_binarypotential

        if not self.uniform_prior:
            # Load the pretrained model once and reuse it for both the embeddings and the config.
            roberta = RobertaForSequenceClassification.from_pretrained('roberta-base')
            self.roberta_embeddings = roberta.roberta.embeddings
            self.hidden_size = roberta.config.hidden_size
            self.history_tranformation = nn.Linear(self.hidden_size, self.hidden_size)
        else:
            assert not self.entropy_regularize_prior_wt > 0.  # Doesn't make sense with a uniform prior
            assert not self.use_structured_prior

        if self.use_structured_prior_binarypotential:
            assert self.use_structured_prior
        if self.use_structured_prior:
            self.emb_dim = args.effect_emb_dim #5
            self.effect_type_emb = nn.Embedding(len(EFFECTS), self.emb_dim)
            self.effect_type_to_feature = nn.Linear(self.emb_dim,1)
            if self.use_structured_prior_binarypotential:
                self.effecthistory_type_to_feature = nn.Linear(self.emb_dim+self.hidden_size, 1)
            self.num_feats = 2
            if self.use_structured_prior_binarypotential:
                self.num_feats += 1
            self.feature_combiner = nn.Parameter(torch.rand(self.num_feats).to(self.args.device))
Code example #3
def get_model(type='trained', path='../models/yelp'):
    if type == 'trained':
        if os.path.exists(path):
            model = RobertaForSequenceClassification.from_pretrained(path)
        else:
            util_files.util.download_model('yelp')
            model = RobertaForSequenceClassification.from_pretrained(path)
    else:
        # Fall back to the stock roberta-base checkpoint when no fine-tuned model is requested.
        model = RobertaForSequenceClassification.from_pretrained('roberta-base')
    return model
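
A hedged usage sketch for get_model, assuming the checkpoint under ../models/yelp was saved together with its tokenizer; the review text is illustrative.

import torch
from transformers import RobertaTokenizer

model = get_model(type='trained', path='../models/yelp')
tokenizer = RobertaTokenizer.from_pretrained('../models/yelp')  # assumes the tokenizer was saved with the model

inputs = tokenizer("The food was great but the service was slow.", return_tensors='pt')
with torch.no_grad():
    logits = model(**inputs)[0]
predicted_class = logits.argmax(dim=-1).item()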
Code example #4
File: models.py  Project: vishalbelsare/OpenMatch
    def __init__(self, config, model_argobj=None):
        NLL.__init__(self, model_argobj)
        RobertaForSequenceClassification.__init__(self, config)
        self.embeddingHead = nn.Linear(config.hidden_size, 768)
        self.norm = nn.LayerNorm(768)

        self.apply(self._init_weights)
        self.sparse_attention_mask_query = None
        self.sparse_attention_mask_document = None

        self.is_representation_l2_normalization = False # switch for L2 normalization after output
        self.is_projection_l2_normalization = False # do l2 normalization on an extra non-linear projection layer
Code example #5
    def __init__(self, batch_size, epoch_num, model_name, is_test):
        self.BATCH_SIZE = batch_size
        self.EPOCHS = epoch_num
        self.NUM_LABELS = 4
        self.model_name = model_name

        if self.model_name == "bert":
            self.model_version = 'bert-base-cased'
            self.tokenizer = BertTokenizer.from_pretrained(self.model_version)
            if is_test:
                self.model = BertForSequenceClassification.from_pretrained(
                    model_name + "_model", num_labels=self.NUM_LABELS)
            else:
                self.model = BertForSequenceClassification.from_pretrained(
                    self.model_version, num_labels=self.NUM_LABELS)
        elif self.model_name == "robert":
            self.model_version = 'roberta-base'
            self.tokenizer = RobertaTokenizer.from_pretrained(
                self.model_version)
            if is_test:
                self.model = RobertaForSequenceClassification.from_pretrained(
                    model_name + "_model", num_labels=self.NUM_LABELS)
            else:
                self.model = RobertaForSequenceClassification.from_pretrained(
                    self.model_version, num_labels=self.NUM_LABELS)
        elif self.model_name == "albert":
            self.model_version = 'albert-base-v2'
            self.tokenizer = AlbertTokenizer.from_pretrained(
                self.model_version)
            if is_test:
                self.model = AlbertForSequenceClassification.from_pretrained(
                    model_name + "_model", num_labels=self.NUM_LABELS)
            else:
                self.model = AlbertForSequenceClassification.from_pretrained(
                    self.model_version, num_labels=self.NUM_LABELS)

        if is_test:
            self.testset = FakeNewsDataset("test", tokenizer=self.tokenizer)
            self.testloader = DataLoader(self.testset,
                                         batch_size=self.BATCH_SIZE,
                                         collate_fn=create_mini_batch)
        else:
            self.trainset = FakeNewsDataset("train", tokenizer=self.tokenizer)
            self.trainloader = DataLoader(self.trainset,
                                          batch_size=self.BATCH_SIZE,
                                          collate_fn=create_mini_batch)
            self.model.train()
            self.optimizer = torch.optim.Adam(self.model.parameters(), lr=1e-5)

        self.device = torch.device(
            "cuda:0" if torch.cuda.is_available() else "cpu")
        self.model.to(self.device)
Code example #6
    def test_inference_classification_head(self):
        model = RobertaForSequenceClassification.from_pretrained("roberta-large-mnli")

        input_ids = torch.tensor([[0, 31414, 232, 328, 740, 1140, 12695, 69, 46078, 1588, 2]])
        output = model(input_ids)[0]
        expected_shape = torch.Size((1, 3))
        self.assertEqual(output.shape, expected_shape)
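
For reference, the hard-coded input_ids in the test can also be produced with the matching tokenizer. A minimal sketch; the premise/hypothesis text is illustrative, and the label order of roberta-large-mnli should be checked against the checkpoint's config.

import torch
from transformers import RobertaTokenizer, RobertaForSequenceClassification

tokenizer = RobertaTokenizer.from_pretrained("roberta-large-mnli")
model = RobertaForSequenceClassification.from_pretrained("roberta-large-mnli")

# Encode a premise/hypothesis pair; the model outputs three logits (NLI classes).
inputs = tokenizer("A man is playing a guitar.", "A person is making music.",
                   return_tensors="pt")
with torch.no_grad():
    logits = model(**inputs)[0]
print(logits.argmax(dim=-1).item())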
Code example #7
def model_trainer(model_path, train_dataset, test_dataset=None):
    model = RobertaForSequenceClassification.from_pretrained(
        'ynie/roberta-large-snli_mnli_fever_anli_R1_R2_R3-nli',
        num_labels=3,
        return_dict=True)

    training_args = TrainingArguments(
        output_dir=model_path,           # output directory
        num_train_epochs=1,              # total number of training epochs
        per_device_train_batch_size=16,  # batch size per device during training
        per_device_eval_batch_size=16,   # batch size for evaluation
        # gradient_accumulation_steps=3,
        warmup_steps=0,                  # number of warmup steps for the learning rate scheduler
        weight_decay=0.01,               # strength of weight decay
        logging_dir=os.path.join(model_path, 'logs'),  # directory for storing logs
        logging_steps=1200,
        save_steps=5900,  # 1200,
        learning_rate=1e-05,
        # save_strategy='epoch'
    )

    if test_dataset is not None:
        trainer = Trainer(
            model=model,                  # the instantiated 🤗 Transformers model to be trained
            args=training_args,           # training arguments, defined above
            train_dataset=train_dataset,  # training dataset
            eval_dataset=test_dataset,    # evaluation dataset
            compute_metrics=compute_metrics,
        )
    else:
        trainer = Trainer(
            model=model,
            args=training_args,
            train_dataset=train_dataset,
            compute_metrics=compute_metrics,
        )
    return trainer, model
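
A short usage sketch for model_trainer, assuming train_dataset/test_dataset are already tokenized datasets compatible with Trainer; the output path is illustrative.

trainer, model = model_trainer('./nli_model', train_dataset, test_dataset)
trainer.train()                     # fine-tune for the configured single epoch
metrics = trainer.evaluate()        # runs compute_metrics on test_dataset
trainer.save_model('./nli_model')   # writes the final checkpoint to output_dir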
Code example #8
    def __init__(self, config):
        super(BERTEncoder, self).__init__()
        self.config = config
        if 'bert_pooling' in config.keys():
            self.pooling = config['bert_pooling']
        else:
            self.pooling = 'cls'

        if self.config['use_huggingface_head']:
            if 'roberta' in self.config['model_name']:

                model = RobertaForSequenceClassification.from_pretrained(
                    config['model_name'],
                    cache_dir=config['cache_dir'],
                    output_attentions=config['use_attentions'])
                self.bert = model.roberta
                self.classifier = model.classifier
            else:
                raise NotImplementedError
        else:
            self.bert_config = AutoConfig.from_pretrained(config['model_name'])
            self.bert_config.output_attentions = config['use_attentions']
            self.bert = AutoModel.from_pretrained(
                config['model_name'],
                cache_dir=config['cache_dir'],
                config=self.bert_config)
Code example #9
def load_transformer_model(model_dir):
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    config = RobertaConfig.from_json_file('{}/config.json'.format(model_dir))
    model = RobertaForSequenceClassification.from_pretrained(model_dir,
                                                             config=config)
    model = model.to(device)
    return model
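
A hedged inference sketch built on load_transformer_model, assuming the tokenizer files were saved in the same model_dir; the directory name and input text are placeholders.

import torch
from transformers import RobertaTokenizer

model = load_transformer_model('checkpoints/roberta-clf')   # hypothetical checkpoint directory
tokenizer = RobertaTokenizer.from_pretrained('checkpoints/roberta-clf')

device = next(model.parameters()).device
encoded = tokenizer("example input text", return_tensors='pt')
encoded = {k: v.to(device) for k, v in encoded.items()}
with torch.no_grad():
    probs = torch.softmax(model(**encoded)[0], dim=-1)
print(probs)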
Code example #10
 def init_model(self):
     self.model = RobertaForSequenceClassification.from_pretrained(
         COLA_MODEL_KEY)
     self.tokenizer = RobertaTokenizer.from_pretrained(COLA_MODEL_KEY)
     if self.use_cuda:
         self.model.cuda()
     self.model.eval()
Code example #11
def Get_Model(modelName):
    model = None
    if modelName == 'XLNet':
        model = XLNetForSequenceClassification.from_pretrained(
            pretrained_model_path,
            # The number of output labels -- 2 for binary classification.
            num_labels=2)
    elif modelName == 'BERT':
        model = BertForSequenceClassification.from_pretrained(
            # Use the 12-layer BERT model, with an uncased vocab.
            pretrained_model_path,
            # The number of output labels -- 2 for binary classification.
            num_labels=2)
    elif modelName == 'RoBerta':
        model = RobertaForSequenceClassification.from_pretrained(
            pretrained_model_path,
            # The number of output labels -- 2 for binary classification.
            num_labels=2)
    elif modelName == 'Albert':
        model = AlbertForSequenceClassification.from_pretrained(
            pretrained_model_path,
            # The number of output labels -- 2 for binary classification.
            num_labels=2)
    return model
Code example #12
File: nli_eval.py  Project: ngarneau/nlgi_eval
    def __init__(self, file_format, use_templates=True, e2e_ignore_restaurant=False):

        self.use_gpu = torch.cuda.is_available()
        logger.debug('Use GPU: %r' % self.use_gpu)

        # load roberta
        logger.debug('Loading models...')
        self.tokenizer = RobertaTokenizer.from_pretrained('roberta-large-mnli')
        self.model = RobertaForSequenceClassification.from_pretrained('roberta-large-mnli')
        if self.use_gpu:
            self.model.to('cuda')

        # load templates
        if use_templates:
            logger.debug('Loading templates...')
            with open(TEMPLATE_PATHS[file_format], 'r', encoding='UTF-8') as fh:
                self.templates = json.load(fh)
        else:
            self.templates = {}
        # set parse method
        if file_format == 'webnlg':
            self.parse_data = self.parse_webnlg
            self.check_with_gold = self.check_with_gold_webnlg
            self.e2e_ignore_restaurant = False  # E2E-only setting
        elif file_format == 'e2e':
            self.parse_data = self.parse_e2e
            self.check_with_gold = self.check_with_gold_e2e
            self.e2e_ignore_restaurant = e2e_ignore_restaurant
        logger.debug('Ready.')
Code example #13
 def __init__(self, model_size, args, num_labels=2):
     super(RoBERTa, self).__init__()
     self.model = RobertaForSequenceClassification.from_pretrained(
         f'roberta-{model_size}',
         num_labels=num_labels,
         hidden_dropout_prob=args['hidden_dropout'],
         attention_probs_dropout_prob=args['attention_dropout'])
Code example #14
File: inference.py  Project: supersun1/MasterProject
def model_fn(model_dir):
    # ===================== Loading the model ========================
    device = torch.device('cpu')
    print('loading the model')

    model_path = os.path.join(model_dir, 'model/')
    tokenizer = RobertaTokenizer.from_pretrained(model_path)

    model = RobertaForSequenceClassification.from_pretrained(model_path,
                                                             num_labels=2,
                                                             output_attentions=False,
                                                             output_hidden_states=False)
    print("model dir: " + str(model_dir))

    # Load the fine-tuned weights exported alongside the model.
    model_pth = os.path.join(model_dir, 'model.pth')
    with open(model_pth, 'rb') as f:
        model.load_state_dict(torch.load(f, map_location=device))
    print('finished reading pth file')

    model.to(device).eval()
    print('finished loading the model')

    model_dict = {'model': model, 'tokenizer': tokenizer}
    return model_dict
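
Under the usual SageMaker PyTorch serving convention, model_fn is paired with a predict_fn that receives the deserialized request and the object returned above. A minimal companion sketch reusing the file's torch import; the "text" key and the label handling are assumptions.

def predict_fn(input_data, model_dict):
    # input_data is assumed to be a dict like {"text": "..."} produced by input_fn.
    model = model_dict['model']
    tokenizer = model_dict['tokenizer']
    inputs = tokenizer(input_data['text'], return_tensors='pt',
                       truncation=True, max_length=512)
    with torch.no_grad():
        logits = model(**inputs)[0]
    probs = torch.softmax(logits, dim=-1).squeeze(0)
    return {'label': int(probs.argmax()), 'score': float(probs.max())}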
Code example #15
File: trainer.py  Project: AviadSar/HaddasahRH
def get_model_and_tokenizer(args, type='pattern'):
    if type == 'pattern':
        dropout = args.pattern_dropout
    elif type == 'classifier':
        dropout = args.classifier_dropout
    else:
        raise ValueError('"type" argument for "get_model_and_tokenizer" mast be "pattern" or "classifier", not {}'.format(type))

    model, tokenizer = None, None
    if 'roberta' in args.model_name:
        tokenizer = RobertaTokenizerFast.from_pretrained(args.model_name)
        if args.model_type == 'sequence_classification':
            model = RobertaForSequenceClassification.from_pretrained(args.model_name,
                                                                     hidden_dropout_prob=dropout,
                                                                     attention_probs_dropout_prob=dropout,
                                                                     num_labels=args.num_labels)
        elif args.model_type == 'MLM':
            model = CompactRobertaForMaskedLM.from_pretrained(args.model_name,
                                                                  hidden_dropout_prob=dropout,
                                                                  attention_probs_dropout_prob=dropout)
        elif args.model_type == 'soft_label_classification':
            model = RobertaForSoftLabelSequenceClassification.from_pretrained(args.model_name,
                                                                     hidden_dropout_prob=dropout,
                                                                     attention_probs_dropout_prob=dropout,
                                                                     num_labels=args.num_labels)
    if model and args.eval:
        model = model.from_pretrained(args.model_dir)
    if model and tokenizer:
        model.resize_token_embeddings(len(tokenizer))
        return model, tokenizer
    raise Exception('no such model: name "{}", type "{}"'.format(args.model_name, args.model_type))
Code example #16
File: Interface.py  Project: ouwenjie03/Clariq_System
 def _init_deep_model(self, model_type, model_path, num_labels, num_regs=None):
     if 'roberta' in model_type:
         tokenizer = RobertaTokenizer.from_pretrained(model_path)
         config = RobertaConfig.from_pretrained(model_path)
         config.num_labels = num_labels
         model = RobertaForSequenceClassification.from_pretrained(model_path, config=config)
         model.eval()
         model.to(self.device)
     elif 'electra_multitask' in model_type:
         tokenizer = ElectraTokenizer.from_pretrained(model_path)
         tokenizer.add_special_tokens({'additional_special_tokens': ['[VALUES]']})
         config = ElectraConfig.from_pretrained(model_path)
         config.num_labels = num_labels
         config.num_regs = num_regs
         config.vocab_size = len(tokenizer)
         model = ElectraForSequenceClassificationMultiTask.from_pretrained(model_path, config=config)
         model.eval()
         model.to(self.device)
     elif 'electra' in model_type:
         tokenizer = ElectraTokenizer.from_pretrained(model_path)
         config = ElectraConfig.from_pretrained(model_path)
         config.num_labels = num_labels
         model = ElectraForSequenceClassification.from_pretrained(model_path, config=config)
         model.eval()
         model.to(self.device)
     else:
         raise NotImplementedError()
     return config, tokenizer, model
Code example #17
File: ArgumentMap.py  Project: lievan/argBERT
    def __init__(self, model_name, device: str = None):
        '''
          inputs: model_name (str), device: (str)

              -initializes argBERT model from the RobertaForSequenceClassification class in transformers
              -initializes tokenizer for argBERT model
              -moves argBERT model to GPU if GPU is available
        '''

        super(argBERT, self).__init__()
        self.argBERT = RobertaForSequenceClassification.from_pretrained(
            model_name)
        self.tokenizer = RobertaTokenizer.from_pretrained(
            model_name,
            bos_token='<s>',
            eos_token='</s>',
            unk_token='<unk>',
            pad_token='<pad>',
            mask_token='<mask>',
            sep_token="</s>",
            cls_token='<s>')
        if device is None:
            device = "cuda" if torch.cuda.is_available() else "cpu"

        self.device = torch.device(device)
        self.argBERT.to(self.device)
        self.best_accuracy_score = 1000
Code example #18
def main():

    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

    config = RobertaConfig.from_pretrained(cf.model_base,
                                           num_labels=cf.num_labels,
                                           finetuning_task=cf.finetuning_task)
    tokenizer = RobertaTokenizer.from_pretrained(cf.model_base,
                                                 do_lower_case=True)
    model = RobertaForSequenceClassification.from_pretrained(cf.model_base,
                                                             config=config)
    model.to(device)

    train_raw_text = get_raw_text(cf.train_file_dir)

    train_features = tokenize_raw_text(train_raw_text, tokenizer)

    train_dataset = create_dataset(train_features)

    optimizer = AdamW(model.parameters(),
                      lr=cf.learning_rate,
                      eps=cf.adam_epsilon)

    global_step, training_loss = train(train_dataset,
                                       model,
                                       optimizer,
                                       batch_size=cf.train_batch_size,
                                       num_epochs=cf.num_epochs)

    torch.save(model.state_dict(), cf.model_file_dir)
Code example #19
def get_training_objects(params):
    """
    Define and return training objects
    """
    config = RobertaConfig.from_pretrained(params["model_name"], num_labels=2)
    model = RobertaForSequenceClassification.from_pretrained(
        params["model_name"], config=config)
    model.to(params["device"])
    no_decay = ["bias", "LayerNorm.weight"]
    gpd_params = [
        {
            "params": [
                p for n, p in model.named_parameters()
                if not any(nd in n for nd in no_decay)
            ],
            "weight_decay":
            params["weight_decay"],
        },
        {
            "params": [
                p for n, p in model.named_parameters()
                if any(nd in n for nd in no_decay)
            ],
            "weight_decay":
            0.0,
        },
    ]
    optimizer = AdamW(gpd_params, lr=params["lr"], eps=params["adam_epsilon"])
    scheduler = get_linear_schedule_with_warmup(
        optimizer,
        num_warmup_steps=params["warmup_steps"],
        num_training_steps=params["total_steps"],
    )
    return model, optimizer, scheduler
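
A compact training-loop sketch showing how the three returned objects are typically used together; the dataloader, batch keys, and gradient clipping value are assumptions, not part of the original snippet.

import torch

model, optimizer, scheduler = get_training_objects(params)
model.train()
for batch in dataloader:                       # hypothetical DataLoader of tokenized batches with labels
    batch = {k: v.to(params["device"]) for k, v in batch.items()}
    loss = model(**batch)[0]                   # with labels present, the first output is the loss
    loss.backward()
    torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)
    optimizer.step()
    scheduler.step()                           # advance the linear warmup/decay schedule per step
    optimizer.zero_grad()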
Code example #20
 def __init__(self, pretrain_path, max_length): 
     nn.Module.__init__(self)
     self.roberta = RobertaForSequenceClassification.from_pretrained(
             pretrain_path,
             num_labels=2)
     self.max_length = max_length
     self.tokenizer = RobertaTokenizer.from_pretrained('roberta-base')
Code example #21
def model_init():
    """Returns an initialized model for use in a Hugging Face Trainer."""
    ## TODO: Return a pretrained RoBERTa model for sequence classification.
    ## See https://huggingface.co/transformers/model_doc/roberta.html#robertaforsequenceclassification.
    from transformers import RobertaForSequenceClassification
    model = RobertaForSequenceClassification.from_pretrained('roberta-base')
    return model
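
model_init is what lets a Trainer re-instantiate a fresh model per run, for example during hyperparameter search. A hedged sketch of how it is usually wired in; the dataset names and TrainingArguments values are placeholders.

from transformers import Trainer, TrainingArguments

training_args = TrainingArguments(output_dir='./hp_search', num_train_epochs=3)
trainer = Trainer(
    model_init=model_init,          # called to build a new model for each trial/run
    args=training_args,
    train_dataset=train_dataset,    # assumed to be tokenized datasets
    eval_dataset=eval_dataset,
)
# trainer.hyperparameter_search(direction="minimize", n_trials=10)  # requires optuna or ray[tune]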
Code example #22
 def __init__(self, model_config):
     self.model_dir = model_config['url']
     print("self.model_dir type: {}".format(type(self.model_dir)))
     print("self.model_dir: {}".format(self.model_dir))
     self.tokenizer = RobertaTokenizer.from_pretrained(self.model_dir)
     self.model = RobertaForSequenceClassification.from_pretrained(
         self.model_dir)
Code example #23
    def __init__(self, config: Bunch) -> None:
        pl.LightningModule.__init__(self)
        self.config = config

        bpe_codes_path = os.path.join(
            config.pretrained_model_base_path,
            "BERTweet_base_transformers/bpe.codes",
        )
        bpe = fastBPE(Namespace(bpe_codes=bpe_codes_path))
        vocab = Dictionary()
        vocab.add_from_file(
            os.path.join(
                config.pretrained_model_base_path,
                "BERTweet_base_transformers/dict.txt",
            ))

        tokenizer = BertweetTokenizer(self.config.max_tokens_per_tweet, bpe,
                                      vocab)
        self.data_processor = BertweetDataProcessor(config, tokenizer)

        model_config = RobertaConfig.from_pretrained(
            os.path.join(
                config.pretrained_model_base_path,
                "BERTweet_base_transformers/config.json",
            ))
        self.model = RobertaForSequenceClassification.from_pretrained(
            os.path.join(
                config.pretrained_model_base_path,
                "BERTweet_base_transformers/model.bin",
            ),
            config=model_config,
        )
        self.loss = CrossEntropyLoss()
Code example #24
    def __init__(self, model, model_size, args):
        super(MTModel, self).__init__()
        if model == 'bert':
            pretrained = BertForSequenceClassification.from_pretrained(
                f'bert-{model_size}-uncased',
                hidden_dropout_prob=args['hidden_dropout'],
                attention_probs_dropout_prob=args['attention_dropout'])
            self.main = pretrained.bert
            self.dropout = pretrained.dropout
        elif model == 'roberta':
            pretrained = RobertaForSequenceClassification.from_pretrained(
                f'roberta-{model_size}',
                hidden_dropout_prob=args['hidden_dropout'],
                attention_probs_dropout_prob=args['attention_dropout'])
            self.main = pretrained.roberta
            self.dropout = pretrained.dropout

        # Freeze embeddings' parameters for saving memory
        for param in self.main.embeddings.parameters():
            param.requires_grad = False

        linear_in_features = 768 if model_size == 'base' else 1024

        self.classifier_a = nn.Linear(in_features=linear_in_features,
                                      out_features=2,
                                      bias=True)
        self.classifier_b = nn.Linear(in_features=linear_in_features,
                                      out_features=3,
                                      bias=True)
        self.classifier_c = nn.Linear(in_features=linear_in_features,
                                      out_features=4,
                                      bias=True)
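
The multi-task heads above imply a forward pass that shares the encoder and branches into the three classifiers. A hedged sketch of such a method, assuming the first-token representation is used for pooling; this forward is not part of the original snippet.

    def forward(self, input_ids, attention_mask=None):
        # Shared encoder; take the first-token (<s>/[CLS]) representation as the pooled output.
        hidden_states = self.main(input_ids, attention_mask=attention_mask)[0]
        pooled = self.dropout(hidden_states[:, 0, :])
        # One logit set per subtask (a: 2-way, b: 3-way, c: 4-way).
        return (self.classifier_a(pooled),
                self.classifier_b(pooled),
                self.classifier_c(pooled))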
Code example #25
    def __init__(
            self,
            device,
            model_path='/projects/bdata/talklife/dssg/ashish/Codes/Rewriting/RL_agent/models/rewards/coherence3.pt',
            batch_size=2):

        self.tokenizer = RobertaTokenizer.from_pretrained('roberta-base',
                                                          do_lower_case=True)
        self.batch_size = batch_size
        self.device = device

        self.model = RobertaForSequenceClassification.from_pretrained(
            "roberta-base",              # 12-layer RoBERTa base model.
            num_labels=2,                # The number of output labels -- 2 for binary classification.
                                         # You can increase this for multi-class tasks.
            output_attentions=False,     # Whether the model returns attention weights.
            output_hidden_states=False,  # Whether the model returns all hidden states.
        )

        self.model = torch.nn.DataParallel(self.model)

        weights = torch.load(model_path)
        self.model.load_state_dict(weights)

        self.model.to(self.device)
Code example #26
 def __init__(self,
              args):
     super().__init__()
     self.args = args
     self.uniform_prior = args.uniform_prior
     if not self.uniform_prior:
         self.roberta_model = RobertaForSequenceClassification.from_pretrained('roberta-base', output_hidden_states=True)
Code example #27
 def __init__(self,
              model_directory: str,
              predictor_name: str,
              device="cuda") -> None:
     self.device = device
     self.config = RobertaConfig.from_pretrained(model_directory)
     # Load in model related information
     self._tokenizer = RobertaTokenizerFast.from_pretrained(
         model_directory, add_special_tokens=False)
     self._model = model = RobertaForSequenceClassification.from_pretrained(
         model_directory, config=self.config).to(device)
     self._model.eval()
     # Prepare optimizer
     no_decay = ["bias", "LayerNorm.weight"]
     optimizer_grouped_parameters = [
         {
             "params": [
                 p for n, p in model.named_parameters()
                 if not any(nd in n for nd in no_decay)
             ],
         },
         {
             "params": [
                 p for n, p in model.named_parameters()
                 if any(nd in n for nd in no_decay)
             ]
         },
     ]
     self._optimizer = AdamW(optimizer_grouped_parameters)
     self._optimizer.load_state_dict(
         torch.load(os.path.join(model_directory, "optimizer.pt")))
Code example #28
def run_fold(name, fold_idx, seed):
    """
      Perform k-fold cross-validation
    """
    if 'roberta-large' in name:
        # MODEL = 'roberta-large'
        model = RobertaForSequenceClassification.from_pretrained(name, num_labels=2)
        config.TOKENIZER = RobertaTokenizer.from_pretrained(name)

    elif 'twitter-roberta-base' in name:
        # MODEL = 'cardiffnlp/twitter-roberta-base'
        model = RobertaForSequenceClassification.from_pretrained(name, num_labels=2)
        config.TOKENIZER = AutoTokenizer.from_pretrained(name)
    else:
        # MODEL = 'digitalepidemiologylab/covid-twitter-bert'
        model = BertForSequenceClassification.from_pretrained(name, num_labels=2)
        config.TOKENIZER = AutoTokenizer.from_pretrained(name)
    seed_all(seed=seed)
    df_train = pd.read_csv(config.TRAIN_FILE, sep='\t')
    # only when we use original data
    df_train.columns = ['tweet_id', 'user_id', 'tweet', 'label']
    df_train = df_train.sample(frac=1).reset_index(drop=True)
    train = df_train

    # dividing folds
    kf = model_selection.StratifiedKFold(n_splits=config.KFOLD, shuffle=True, random_state=seed)
    idx = None

    for fold, (train_idx, val_idx) in enumerate(kf.split(X=train, y=train.label.values)):
        train.loc[val_idx, 'kfold'] = int(fold)
        if fold == fold_idx:
            idx = val_idx

    if os.path.isfile(config.OOF_FILE):
        scores = pd.read_csv(config.OOF_FILE)
        print('Found oof file')
    else:
        scores = train.copy()
        scores['oof'] = 0
        scores.to_csv(config.OOF_FILE, index=False)
        print('Created oof file')
    df_train = train[train.kfold != fold_idx]
    df_val = train[train.kfold == fold_idx]
    y, y_dict = run(model, df_train, df_val, fold_idx)
    scores.loc[idx, 'oof'] = y

    scores.to_csv(config.OOF_FILE, index=False)
Code example #29
def predict_pair(model_args, data_args, training_args):
    # Set seed
    set_seed(training_args.seed)

    if 'roberta' in model_args.model_type:
        tokenizer = RobertaTokenizer.from_pretrained(model_args.tokenizer_name_or_path)
        config = RobertaConfig.from_pretrained(model_args.model_name_or_path)
        config.num_labels = data_args.num_labels
        model = RobertaForSequenceClassification.from_pretrained(model_args.model_name_or_path, config=config)
    elif 'electra' in model_args.model_type:
        tokenizer = ElectraTokenizer.from_pretrained(model_args.tokenizer_name_or_path)
        config = ElectraConfig.from_pretrained(model_args.model_name_or_path)
        config.num_labels = data_args.num_labels
        model = ElectraForSequenceClassification.from_pretrained(model_args.model_name_or_path, config=config)
    else:
        # default -> bert
        tokenizer = BertTokenizer.from_pretrained(model_args.tokenizer_name_or_path)
        config = BertConfig.from_pretrained(model_args.model_name_or_path)
        config.num_labels = data_args.num_labels
        model = BertForSequenceClassification.from_pretrained(model_args.model_name_or_path, config=config)

    model.to(training_args.device)

    test_df = pickle.load(open(data_args.test_data_file, 'rb'))
    test_dataset = get_dataset(data_args, tokenizer, test_df, model_args.model_type)
    data_collator = MyDataCollator()
    if training_args.local_rank != -1:
        sampler = SequentialDistributedSampler(test_dataset)
        model = torch.nn.DataParallel(model)
    else:
        n_gpu = torch.cuda.device_count()
        if n_gpu > 1:
            model = torch.nn.DataParallel(model)
        sampler = SequentialSampler(test_dataset)
    print(len(test_dataset))
    dataloader = DataLoader(
        test_dataset,
        sampler=sampler,
        batch_size=training_args.eval_batch_size,
        collate_fn=data_collator,
    )

    model.eval()
    all_probs = []
    for inputs in tqdm(dataloader):
        for k, v in inputs.items():
            inputs[k] = v.to(training_args.device)
        inputs.pop('labels')
        with torch.no_grad():
            outputs = model(**inputs)
            logits = outputs[0]
            probs = torch.softmax(logits, dim=-1)
            maxp, maxi = torch.max(probs, dim=-1)
            result = [(_i, _p) for _p, _i in zip(maxp, maxi)]
            all_probs.extend(result)

    with open('./{}_{}.answer_classify.result'.format(data_args.data_type, model_args.model_type), 'w', encoding='utf-8') as fout:
        for i in range(len(test_df)):
            fout.write('{} | {} | {} | {} | {}\n'.format(test_df[i][0], test_df[i][1], test_df[i][2], all_probs[i][0], all_probs[i][1]))
Code example #30
 def __init__(self, model_pn="D:/Language Models/ROBERTA-LARGE/"):
     self.model = RobertaForSequenceClassification.from_pretrained(model_pn)
     self.tokenizer = RobertaTokenizer.from_pretrained(model_pn)
     self.ft_weights = torch.load(f'{model_pn}{"detector-large.pt"}', map_location='cpu')
     self.model.to(torch.device("cpu"))
     self.model.load_state_dict(self.ft_weights['model_state_dict'], strict=False)
     self.model.eval()
     self._text = None
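
A hedged scoring sketch for the detector class above; the method name detect is hypothetical, torch is taken from the file's existing imports, and the real/fake label order of the detector-large.pt checkpoint should be verified against its config.

 def detect(self, text):
     self._text = text
     tokens = self.tokenizer(text, return_tensors='pt', truncation=True, max_length=512)
     with torch.no_grad():
         logits = self.model(**tokens)[0]
     probs = torch.softmax(logits, dim=-1).squeeze(0)
     # Returns the two class probabilities; map them to real/fake per the checkpoint's labels.
     return probs.tolist()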