Example #1
 def __init__(self, args):
     pretrain_name = 'bert-base-cased'
     if args.model_info.bert_path:
         pretrain_name = args.model_info.bert_path
     print(f"Tokenizer from:{pretrain_name}")
     train_conf = args.train_info
     model_conf = args.model_info
     self.model_type = model_conf.model
     if self.model_type == 'bert_seq':
         self.model = BertClassifier(model_conf)
         self.tokenizer = BertTokenizer.from_pretrained(pretrain_name)
         self.ds = SentimentDataset
     if self.model_type == 'GPT2':
         self.model = GPT2Classifier(model_conf)
         self.tokenizer = GPT2Tokenizer.from_pretrained(pretrain_name)
         self.ds = GPT2Dataset
     self.model.load_state_dict(torch.load(train_conf.model_path))
     self.device = train_conf.device
     self.class_num = model_conf.class_num
     self.model.to(self.device)
     self.lr = train_conf.lr
     self.max_len = train_conf.max_seq_len
     self.conf = args
     self.label_map = json.load(open(args.label_map_path))
     self.id2label = dict([(i, label_str)
                           for label_str, i in self.label_map.items()])
Example #2
    def __init__(self, config, RecognitionResultManager):
        self.config = config
        self.genre_keywords_db = self.get_db(self.config['DB']['host'],
                                             self.config['DB']['db_name'],
                                             self.config['DB']['user'])
        self.rrm = RecognitionResultManager

        self.model_path = "/Users/jinsakuma/Downloads/model_gpu_v4.3.3.pth"
        self.model_config = BertConfig.from_pretrained(
            'cl-tohoku/bert-base-japanese-whole-word-masking',
            output_attentions=True)
        self.tokenizer = BertJapaneseTokenizer.from_pretrained(
            'cl-tohoku/bert-base-japanese-whole-word-masking')
        self.bert_model = BertModel.from_pretrained(
            'cl-tohoku/bert-base-japanese-whole-word-masking',
            config=self.model_config)
        self.model = BertClassifier(self.bert_model)
        self.max_len = 30
        self.load_weights(self.model_path)

        self.device = torch.device("cpu")

        self.order_list = [
            'recommendation', 'title', 'abstract', 'review', 'evaluation',
            'actor', 'genre', 'director', None
        ]
Example #3
class Predictor(object):
    def __init__(self, args):
        pretrain_name = 'bert-base-cased'
        if args.model_info.bert_path:
            pretrain_name = args.model_info.bert_path
        self.tokenizer = BertTokenizer.from_pretrained(pretrain_name)
        print(f"Tokenizer from:{pretrain_name}")
        train_conf = args.train_info
        model_conf = args.model_info
        self.device = train_conf.device
        self.class_num = model_conf.class_num
        self.model = BertClassifier(model_conf)
        self.model.load_state_dict(
            torch.load(train_conf.model_path,
                       map_location=torch.device(self.device)))
        self.model.to(self.device)
        self.lr = train_conf.lr
        self.max_len = train_conf.max_seq_len
        self.conf = args
        self.label_map = json.load(open(args.label_map_path))
        self.id2label = dict([(i, label_str)
                              for label_str, i in self.label_map.items()])
        self.softmax = Softmax(dim=1)

    def predict(self, sens):
        d_loader = self.sen_2_dl(sens)
        y_pred = list()
        with torch.no_grad():
            for batch in d_loader:
                input_ids = batch['input_ids'].to(self.device)
                attention_mask = batch['attention_mask'].to(self.device)
                logits = self.model(input_ids, attention_mask)
                logits = torch.sigmoid(logits)
                y_pred.append(logits)
        y_pred = torch.cat(y_pred)
        y_pred = y_pred.cpu().numpy()
        res = list()
        for y in y_pred:
            res.append(self._score_2_dict(y))
        return res

    def _score_2_dict(self, single_pred):
        res = dict()
        for i, score in enumerate(single_pred):
            label_str = self.id2label[i]
            res[label_str] = float(score)
        return res

    def sen_2_dl(self, sens):
        texts = [i.strip() for i in sens]
        labels = [999]  # placeholder; not a valid label, but the dataset constructor requires one
        ds = SentimentDataset(self.tokenizer, texts, labels, self.max_len)
        _loader = dataloader.DataLoader(
            ds, batch_size=self.conf.train_info.batch_size, shuffle=False)
        return _loader
Example #4
 def create_model(self, num_train_step, num_warmup_step):
     """
     Select the model specified in the config file and initialize it.
     :return:
     """
     model = BertClassifier(config=self.config, num_train_step=num_train_step, num_warmup_step=num_warmup_step)
     return model
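
A hedged usage sketch (not from the original repository) of how the two step counts are typically derived before calling create_model; the wrapper method name, trainer attributes, and config keys below are assumptions for illustration only:

 def build(self, train_examples):
     # Illustrative only: total optimization steps over all epochs, with a 10% linear warmup.
     num_train_step = (len(train_examples) // self.config["batch_size"]) * self.config["epochs"]
     num_warmup_step = int(num_train_step * 0.1)
     return self.create_model(num_train_step, num_warmup_step)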
Example #5
 def __init__(self, args):
     pretrain_name = 'bert-base-cased'
     if args.model_info.bert_path:
         pretrain_name = args.model_info.bert_path
     self.tokenizer = BertTokenizer.from_pretrained(pretrain_name)
     print(f"Tokenizer from:{pretrain_name}")
     train_conf = args.train_info
     model_conf = args.model_info
     self.device = train_conf.device
     self.class_num = model_conf.class_num
     self.model = BertClassifier(model_conf)
     self.model.load_state_dict(
         torch.load(train_conf.model_path,
                    map_location=torch.device(self.device)))
     self.model.to(self.device)
     self.lr = train_conf.lr
     self.max_len = train_conf.max_seq_len
     self.conf = args
     self.label_map = json.load(open(args.label_map_path))
     self.id2label = dict([(i, label_str)
                           for label_str, i in self.label_map.items()])
     self.softmax = Softmax(dim=1)
Example #6
class Trainer(object):
    def __init__(self, config):
        self.config = config
        self.data_processor = DataProcessor("/Users/a5560648/workspace/tutor/data", max_len=config["max_len"])
        self.model = BertClassifier(config=config)

    def train(self):
        data_loader = DataLoader(self.data_processor.get_dataset(), batch_size=self.config["batch_size"], shuffle=True, drop_last=True)
        optimizer = torch.optim.Adam(self.model.parameters(), lr=self.config["lr"]) 
        loss_fn = torch.nn.functional.cross_entropy
        for epoch in range(self.config["epoch"]):
            with tqdm(total=len(data_loader)) as pbar:
                for input_ids, token_type_ids, attention_mask, labels in data_loader:
                    optimizer.zero_grad()
                    output = self.model(input_ids, token_type_ids, attention_mask)
                    loss = loss_fn(output, labels)
                    loss.backward()
                    optimizer.step()
                    pbar.update(1)
Example #7
class NLU:
    def __init__(self, config, RecognitionResultManager):
        self.config = config
        self.genre_keywords_db = self.get_db(self.config['DB']['host'],
                                             self.config['DB']['db_name'],
                                             self.config['DB']['user'])
        self.rrm = RecognitionResultManager

        self.model_path = "/Users/jinsakuma/Downloads/model_gpu_v4.3.3.pth"
        self.model_config = BertConfig.from_pretrained(
            'cl-tohoku/bert-base-japanese-whole-word-masking',
            output_attentions=True)
        self.tokenizer = BertJapaneseTokenizer.from_pretrained(
            'cl-tohoku/bert-base-japanese-whole-word-masking')
        self.bert_model = BertModel.from_pretrained(
            'cl-tohoku/bert-base-japanese-whole-word-masking',
            config=self.model_config)
        self.model = BertClassifier(self.bert_model)
        self.max_len = 30
        self.load_weights(self.model_path)

        self.device = torch.device("cpu")

        self.order_list = [
            'recommendation', 'title', 'abstract', 'review', 'evaluation',
            'actor', 'genre', 'director', None
        ]

    def get_db(self, host="localhost", db_name="woz_system", user="******"):
        '''
        Fetch the utterance contents from MySQL in one batch
        :return: db (dict)
        '''
        connector = MySQLdb.connect(host=host,
                                    db=db_name,
                                    user=user,
                                    passwd="",
                                    charset="utf8")
        cursor = connector.cursor()  # create a cursor
        # Keywords for movie recommendation
        cursor.execute('select * from genre')
        genres = cursor.fetchall()
        genre_keywords_db = {}
        for genre in genres:
            genre_id = genre[1]
            genre_type = genre[2]  # .encode('utf-8')
            genre_keywords_db[genre_type] = []
            cursor.execute(
                'select keywords from genre_keywords where genre_id={}'.format(
                    genre_id))
            keywords = cursor.fetchall()
            keyword_list = keywords[0][0].split(",")
            genre_keywords_db[genre_type] = keyword_list
        return genre_keywords_db

    def load_weights(self, model_path):
        load_weights = torch.load(model_path, map_location={'cuda:0': 'cpu'})
        self.model.load_state_dict(load_weights)

    def bert_tokenizer(self, input_text):
        return self.tokenizer.encode(input_text,
                                     max_length=self.max_len,
                                     truncation=True,
                                     return_tensors='pt')[0]

    def get_order(self, input_text):
        token = self.bert_tokenizer(input_text)
        token = token.unsqueeze(0)
        output, attentions = self.model(token.to(self.device))
        _, pred = torch.max(output, 1)

        print("NLU result: ", self.order_list[pred.item()])
        return self.order_list[pred.item()]

    def get_text(self, N):
        df = self.rrm.get_df()
        text_list = df['transcript'].iloc[-N:].tolist()
        target_list = df['speaker'].iloc[-N:].tolist()
        return text_list, target_list

    def check_genre(self, input_texts):
        # Keyword matching
        for text in reversed(input_texts):
            for response_type, keywords in self.genre_keywords_db.items():
                for keyword in keywords:
                    if keyword in text:
                        return response_type

        return None
Example #8
def main(args, f):
    set_seed(args.train_seed)
    if args.model in ['roberta', 'distilroberta']:
        tokenizer = RobertaTokenizer.from_pretrained('roberta-base')
    else:
        tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

    # preprocess data
    src_eval_loader, src_loader, tgt_all_loader, tgt_train_loader, tgt_te = get_all_dataloader(
        args, tokenizer)

    # load models
    if args.model == 'bert':
        src_encoder = BertEncoder()
        # encoder = BertEncoder()
        classifier = BertClassifier()
    elif args.model == 'distilbert':
        src_encoder = DistilBertEncoder()
        # encoder = DistilBertEncoder()
        classifier = BertClassifier()
    elif args.model == 'roberta':
        src_encoder = RobertaEncoder()
        # encoder = RobertaEncoder()
        classifier = RobertaClassifier()
    else:
        src_encoder = DistilRobertaEncoder()
        # encoder = DistilRobertaEncoder()
        classifier = RobertaClassifier()
    discriminator = Discriminator()

    # parallel models
    if torch.cuda.device_count() > 1:
        print('Let\'s use {} GPUs!'.format(torch.cuda.device_count()))
        src_encoder = nn.DataParallel(src_encoder)
        classifier = nn.DataParallel(classifier)
        # encoder = nn.DataParallel(encoder)
        discriminator = nn.DataParallel(discriminator)

    if args.load:
        src_encoder = init_model(args,
                                 src_encoder,
                                 restore_path=param.src_encoder_path)
        classifier = init_model(args,
                                classifier,
                                restore_path=param.src_classifier_path)
        # encoder = init_model(args, encoder, restore_path=param.tgt_encoder_path)
        # discriminator = init_model(args, discriminator, restore_path=param.d_model_path)
    else:
        src_encoder = init_model(args, src_encoder)
        classifier = init_model(args, classifier)

    # encoder = init_model(args, encoder)
    discriminator = init_model(args, discriminator)

    # train source model
    if args.pretrain:
        print("=== Training classifier for source domain ===")
        src_encoder, classifier = pretrain(args, src_encoder, classifier,
                                           src_loader)

        # save pretrained model
        # save_model(args, src_encoder, param.src_encoder_path)
        # save_model(args, classifier, param.src_classifier_path)

    # eval source model
    print("=== Evaluating classifier for source domain ===")
    evaluate(args, src_encoder, classifier, src_loader)
    src_acc = evaluate(args, src_encoder, classifier, tgt_all_loader)
    f.write(f'{args.src} -> {args.tgt}: No adapt acc on src data: {src_acc}\n')

    # adapt
    print("=== Adapt tgt encoder ===")
    # encoder.load_state_dict(src_encoder.state_dict())
    # if args.src_free:
    # s_res_features = src_gmm(args, src_encoder, src_loader)
    # src_loader = s_numpy_dataloader(s_res_features, args.batch_size)
    # encoder = aad_adapt_src_free(args, src_encoder, encoder, discriminator,
    #                                  classifier, src_loader, tgt_train_loader, tgt_all_loader)
    # else:
    if args.adapt:
        encoder, classifier = shot_adapt(args, src_encoder, classifier,
                                         tgt_train_loader, tgt_all_loader,
                                         tgt_te)

    # save_model(args, encoder, param.tgt_encoder_path)

    # argument setting
    # print("=== Argument Setting ===")
    print(
        f"model_type: {args.model}; max_seq_len: {args.max_seq_length}; batch_size: {args.batch_size}; "
        f"pre_epochs: {args.pre_epochs}; num_epochs: {args.num_epochs}; src: {args.src}; tgt: {args.tgt}; "
        f'src_free: {args.src_free}; dp: {args.dp}')

    # eval target encoder on lambda0.1 set of target dataset
    print("=== Evaluating classifier for encoded target domain ===")
    print(">>> domain adaption <<<")
    tgt_acc = evaluate(args, encoder, classifier, tgt_all_loader)
    f.write(f'{args.src} -> {args.tgt}: DA acc on tgt data: {tgt_acc}\n')
    f.write(
        f"model_type: {args.model}; batch_size: {args.batch_size}; pre_epochs: {args.pre_epochs}; "
        f"num_epochs: {args.num_epochs}; src_free: {args.src_free}; src: {args.src}; "
        f"tgt: {args.tgt}; dp: {args.dp}\n\n")
Example #9
 def __init__(self, config):
     self.config = config
     self.data_processor = DataProcessor("/Users/a5560648/workspace/tutor/data", max_len=config["max_len"])
     self.model = BertClassifier(config=config)
Example #10
def main():
    args = parse_arguments()
    # argument setting
    print("=== Argument Setting ===")
    print("src: " + args.src)
    print("tgt: " + args.tgt)
    print("alpha: " + str(args.alpha))
    print("seed: " + str(args.seed))
    print("train_seed: " + str(args.train_seed))
    print("model_type: " + str(args.model))
    print("max_seq_length: " + str(args.max_seq_length))
    print("batch_size: " + str(args.batch_size))
    print("num_epochs: " + str(args.num_epochs))
    set_seed(args.train_seed)

    if args.model == 'roberta':
        tokenizer = RobertaTokenizer.from_pretrained('roberta-base')
    else:
        tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

    # preprocess data
    print("=== Processing datasets ===")
    if args.src == 'blog':
        src_x, src_y = CSV2Array(os.path.join('data', args.src, 'blog.csv'))

    elif args.src == 'airline':
        src_x, src_y = CSV2Array(os.path.join('data', args.src, 'airline.csv'))

    else:
        src_x, src_y = XML2Array(
            os.path.join('data', args.src, 'negative.review'),
            os.path.join('data', args.src, 'positive.review'))

    src_x, src_test_x, src_y, src_test_y = train_test_split(
        src_x, src_y, test_size=0.2, stratify=src_y, random_state=args.seed)

    if args.tgt == 'blog':
        tgt_x, tgt_y = CSV2Array(os.path.join('data', args.tgt, 'blog.csv'))

    elif args.tgt == 'airline':
        tgt_x, tgt_y = CSV2Array(os.path.join('data', args.tgt, 'airline.csv'))
    else:
        tgt_x, tgt_y = XML2Array(
            os.path.join('data', args.tgt, 'negative.review'),
            os.path.join('data', args.tgt, 'positive.review'))

    tgt_train_x, _, tgt_train_y, _ = train_test_split(tgt_x,
                                                      tgt_y,
                                                      test_size=0.2,
                                                      stratify=tgt_y,
                                                      random_state=args.seed)

    if args.model == 'roberta':
        src_features = roberta_convert_examples_to_features(
            src_x, src_y, args.max_seq_length, tokenizer)
        src_test_features = roberta_convert_examples_to_features(
            src_test_x, src_test_y, args.max_seq_length, tokenizer)
        tgt_features = roberta_convert_examples_to_features(
            tgt_train_x, tgt_train_y, args.max_seq_length, tokenizer)
        tgt_all_features = roberta_convert_examples_to_features(
            tgt_x, tgt_y, args.max_seq_length, tokenizer)
    else:
        src_features = convert_examples_to_features(src_x, src_y,
                                                    args.max_seq_length,
                                                    tokenizer)
        src_test_features = convert_examples_to_features(
            src_test_x, src_test_y, args.max_seq_length, tokenizer)
        tgt_features = convert_examples_to_features(tgt_train_x, tgt_train_y,
                                                    args.max_seq_length,
                                                    tokenizer)
        tgt_all_features = convert_examples_to_features(
            tgt_x, tgt_y, args.max_seq_length, tokenizer)

    # load dataset

    src_data_loader = get_data_loader(src_features, args.batch_size)
    src_data_loader_eval = get_data_loader(src_test_features, args.batch_size)
    tgt_data_loader = get_data_loader(tgt_features, args.batch_size)
    tgt_data_loader_all = get_data_loader(tgt_all_features, args.batch_size)

    # load models
    if args.model == 'bert':
        encoder = BertEncoder()
        cls_classifier = BertClassifier()
        dom_classifier = DomainClassifier()
    elif args.model == 'distilbert':
        encoder = DistilBertEncoder()
        cls_classifier = BertClassifier()
        dom_classifier = DomainClassifier()
    else:
        encoder = RobertaEncoder()
        cls_classifier = RobertaClassifier()
        dom_classifier = RobertaDomainClassifier()

    if args.load:
        encoder = init_model(encoder, restore=param.encoder_path)
        cls_classifier = init_model(cls_classifier,
                                    restore=param.cls_classifier_path)
        dom_classifier = init_model(dom_classifier,
                                    restore=param.dom_classifier_path)
    else:
        encoder = init_model(encoder)
        cls_classifier = init_model(cls_classifier)
        dom_classifier = init_model(dom_classifier)

    print("=== Start Training ===")
    if args.train:
        encoder, cls_classifier, dom_classifier = train(
            args, encoder, cls_classifier, dom_classifier, src_data_loader,
            src_data_loader_eval, tgt_data_loader, tgt_data_loader_all)

    print("=== Evaluating classifier for encoded target domain ===")
    print(">>> after training <<<")
    evaluate(encoder, cls_classifier, tgt_data_loader_all)
Example #11
def main(paras):

    logger = logging.getLogger(__name__)
    if paras.save_log_file:
        logging.basicConfig(format='%(asctime)s - %(levelname)s - %(name)s -   %(message)s',
                            datefmt='%m/%d/%Y %H:%M:%S',
                            level=paras.logging_level,
                            filename=f'{paras.log_save_path}/{paras.train_log_file}',
                            filemode='w')
    else:
        logging.basicConfig(format='%(asctime)s - %(levelname)s - %(name)s -   %(message)s',
                            datefmt='%m/%d/%Y %H:%M:%S',
                            level=paras.logging_level, )

    device = 'cuda' if torch.cuda.is_available() else 'cpu'

    logger.info(f'Loading model: {paras.model_name}')
    tokenizer = BertTokenizer.from_pretrained(paras.model_name)
    bert = BertModel.from_pretrained(paras.model_name)


    train_dataset = RE_Dataset(paras, 'train')
    train_dataloader = DataLoader(train_dataset, batch_size=paras.batch_size,
                                  shuffle=paras.shuffle, drop_last=paras.drop_last)
    label_to_index = train_dataset.label_to_index
    special_token_list = list(train_dataset.special_token_set)
    # fixme: add special token to tokenizer
    special_tokens_dict = {'additional_special_tokens': special_token_list}
    tokenizer.add_special_tokens(special_tokens_dict)
    # bert.resize_token_embeddings(len(tokenizer))

    test_dataset = RE_Dataset(paras, 'test')
    test_dataloader = DataLoader(test_dataset, batch_size=paras.batch_size,
                                 shuffle=paras.shuffle, drop_last=paras.drop_last)

    bert_classifier = BertClassifier(bert, paras.hidden_size, paras.label_number,
                                     paras.dropout_prob)

    if paras.optimizer == 'adam':
        logger.info('Loading Adam optimizer.')
        optimizer = torch.optim.Adam(bert_classifier.parameters(), lr=paras.learning_rate)
    elif paras.optimizer == 'adamw':
        logger.info('Loading AdamW optimizer.')
        no_decay = [ 'bias', 'LayerNorm.weight' ]
        optimizer_grouped_parameters = [
            {'params': [ p for n, p in bert_classifier.named_parameters() if not any(nd in n for nd in no_decay) ],
             'weight_decay': 0.01},
            {'params': [ p for n, p in bert_classifier.named_parameters() if any(nd in n for nd in no_decay) ],
             'weight_decay': 0.0}
        ]
        optimizer = AdamW(optimizer_grouped_parameters, lr=paras.learning_rate,
                          eps=paras.adam_epsilon)
    else:
        logger.warning(f'optimizer must be "Adam" or "AdamW", but got {paras.optimizer}.')
        logger.info('Loading Adam optimizer.')
        optimizer = torch.optim.Adam(bert_classifier.parameters(),
                                     lr=paras.learning_rate)


    logger.info('Training Start.')
    best_eval = {'acc': 0, 'precision': 0, 'recall': 0, 'f1': 0, 'loss': 0}
    for epoch in range(paras.num_train_epochs):
        epoch_loss = 0
        bert_classifier.train()
        for step, batch in enumerate(train_dataloader):
            optimizer.zero_grad()

            batch_data, batch_label = batch

            encoded_data = tokenizer(batch_data,
                                     padding=True,
                                     truncation=True,
                                     return_tensors='pt',
                                     max_length=paras.max_sequence_length)

            label_tensor = batch_label_to_idx(batch_label, label_to_index)

            loss = bert_classifier(encoded_data, label_tensor)

            epoch_loss += loss_to_int(loss)

            logging.info(f'epoch: {epoch}, step: {step}, loss: {loss:.4f}')

            # fixme: del
            # acc, precision, recall, f1 = evaluation(bert_classifier, tokenizer, test_dataloader,
            #                                         paras.max_sequence_length, label_to_index)
            # logger.info(f'Accuracy: {acc:.4f}, Precision: {precision:.4f}, '
            #             f'Recall: {recall:.4f}, F1-score: {f1:.4f}')

            loss.backward()
            optimizer.step()

        epoch_loss = epoch_loss / len(train_dataloader)

        acc, precision, recall, f1 = evaluation(bert_classifier, tokenizer, test_dataloader,
                                                paras.max_sequence_length, label_to_index)

        logging.info(f'Epoch: {epoch}, Epoch-Average Loss: {epoch_loss:.4f}')
        logger.info(f'Accuracy: {acc:.4f}, Precision: {precision:.4f}, '
                    f'Recall: {recall:.4f}, F1-score: {f1:.4f}')

        if best_eval['loss'] == 0 or f1 > best_eval['f1']:
            best_eval['loss'] = epoch_loss
            best_eval['acc'] = acc
            best_eval['precision'] = precision
            best_eval['recall'] = recall
            best_eval['f1'] = f1
            torch.save(bert_classifier, f'{paras.log_save_path}/{paras.model_save_name}')

            with open(f'{paras.log_save_path}/{paras.checkpoint_file}', 'w') as wf:
                wf.write(f'Save time: {time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())}\n')
                wf.write(f'Best F1-score: {best_eval["f1"]:.4f}\n')
                wf.write(f'Precision: {best_eval["precision"]:.4f}\n')
                wf.write(f'Recall: {best_eval["recall"]:.4f}\n')
                wf.write(f'Accuracy: {best_eval["acc"]:.4f}\n')
                wf.write(f'Epoch-Average Loss: {best_eval["loss"]:.4f}\n')

            logger.info(f'Updated model, best F1-score: {best_eval["f1"]:.4f}\n')

    logger.info(f'Train complete, Best F1-score: {best_eval["f1"]:.4f}.')
Example #12
# coding: utf-8
# @File: predict.py
# @Author: HE D.H.
# @Email: [email protected]
# @Time: 2020/10/10 17:13:57
# @Description:

import torch
from model import BertClassifier
from transformers import BertTokenizer, BertConfig

labels = ['体育', '娱乐', '家居', '房产', '教育', '时尚', '时政', '游戏', '科技', '财经']
bert_config = BertConfig.from_pretrained('bert-base-chinese')

model = BertClassifier(bert_config, len(labels))
model.load_state_dict(
    torch.load('models/best_model.pkl', map_location=torch.device('cpu')))

tokenizer = BertTokenizer.from_pretrained('bert-base-chinese')

print('新闻类别分类')
while True:
    text = input('Input: ')
    token = tokenizer(text,
                      add_special_tokens=True,
                      padding='max_length',
                      truncation=True,
                      max_length=512)
    input_ids = token['input_ids']
    attention_mask = token['attention_mask']
    token_type_ids = token['token_type_ids']
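    # Illustrative sketch of the remaining inference step (the listing is truncated above):
    # it assumes the classifier's forward accepts input_ids / attention_mask / token_type_ids
    # tensors, as in the companion training script shown later, and returns logits over `labels`.
    predicted = model(
        input_ids=torch.tensor([input_ids], dtype=torch.long),
        attention_mask=torch.tensor([attention_mask], dtype=torch.long),
        token_type_ids=torch.tensor([token_type_ids], dtype=torch.long),
    )
    pred_label = torch.argmax(predicted, dim=1)
    print('Label:', labels[pred_label.item()])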
Example #13
    attention_mask = (x != 0).float().to(config.DEVICE).long()
    outputs = MODEL(x, attention_mask=attention_mask)
    return outputs.cpu().detach().numpy()


@app.route('/predict')
def predict():
    comment = request.args.get('comment')
    start_time = time.time()
    prediction = comment_prediction(comment)
    response = {
        'response': {
            label: str(prob)
            for label, prob in zip(config.CLASS_COLS, prediction[0])
        }
    }
    response['response']['comment'] = comment
    response['response']['time_taken'] = str(time.time() - start_time)

    return flask.jsonify(response)


if __name__ == '__main__':
    bert_config = BertConfig.from_pretrained(config.BERT_NAME)
    bert_config.num_labels = config.NUM_CLASSES
    MODEL = BertClassifier(bert_config)
    MODEL.load_state_dict(torch.load(config.TRAINED_MODEL_PATH))
    MODEL.to(config.DEVICE)
    MODEL.eval()
    app.run(host=config.HOST, port=config.PORT)
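
A hedged client-side usage sketch for the /predict route above; the host, port, and example comment are placeholders rather than values taken from the original code:

import requests

# Illustrative only: query the running Flask service defined above.
resp = requests.get('http://localhost:5000/predict', params={'comment': 'great movie, would watch again'})
print(resp.json())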
Example #14
def train(dataloader,
          head_trans,
          body_trans,
          classifier,
          load_model=False,
          save_model=True,
          num_epochs=2):

    torch.backends.cudnn.benchmark = True
    # device = 'cuda' if torch.cuda.is_available() else 'cpu'
    device = 'cpu'
    print(device)
    load_model = load_model
    save_model = save_model

    learning_rate = 3e-3
    num_epochs = num_epochs

    # For tensorboard
    writer = SummaryWriter('runs/bert')
    step = 0

    # Initialize Model
    model = BertClassifier(head_trans, body_trans, classifier).to(device)

    criterion = nn.CrossEntropyLoss()
    optimizer = optim.Adam(model.parameters(), lr=learning_rate)

    if load_model:
        model, optimizer, step = load_checkpoint(
            torch.load('bert_chkpnt/my_checkpoint.pth.tar'), model, optimizer)
        return model

    for epoch in range(num_epochs):
        if save_model:
            checkpoint = {
                'state_dict': model.state_dict(),
                'optimizer': optimizer.state_dict(),
                'step': step
            }
            save_checkpoint(checkpoint)

        loop = tqdm(enumerate(dataloader), total=len(dataloader), leave=False)
        running_loss = 0.0
        running_accuracy = 0

        for batch, (head, body, stance) in loop:

            outputs = model(head.to(device), body.to(device))
            loss = criterion(outputs.float(), stance.to(device).long())

            writer.add_scalar('Training Loss', loss.item(), step)
            step += 1

            optimizer.zero_grad()
            loss.backward()
            optimizer.step()

            # Update progress bar
            loop.set_description(f'Epoch [{epoch+1}/{num_epochs}]')
            loop.set_postfix(loss=loss.item())

            running_loss += loss.item()
            running_accuracy += (
                (torch.argmax(outputs, dim=1)
                 == stance.to(device)).sum().item()) / BATCH_SIZE
            if (batch + 1) % 10 == 0:
                writer.add_scalar('Running Loss', running_loss / 10,
                                  epoch * len(dataloader) + batch)
                writer.add_scalar('Running Accuracy', running_accuracy / 10,
                                  epoch * len(dataloader) + batch)

                running_loss = 0.0
                running_accuracy = 0

    return model
Example #15
def main():
    device = torch.device('cuda:3')
    # Load the datasets
    print('加载训练数据')
    train_data = load_data('dataset/train.csv')
    print('加载验证数据')
    valid_data = load_data('dataset/test.csv')
    # test_data = load_data('cnews/cnews.test.txt')

    batch_size = 16

    # Create batches
    print('生成batch')
    train_dataloader = DataLoader(train_data,
                                  batch_size=batch_size,
                                  shuffle=True,
                                  num_workers=3)
    valid_dataloader = DataLoader(valid_data,
                                  batch_size=batch_size,
                                  shuffle=False,
                                  num_workers=3)
    # test_dataloader = DataLoader(valid_data, batch_size=batch_size, shuffle=False)

    # Load the BERT configuration
    bert_config = BertConfig.from_pretrained('./chinese_wwm_pytorch')
    bert_config.num_labels = num_labels
    print(bert_config)

    # Initialize the model
    model = BertClassifier(bert_config)
    # model.to(device)

    # Hyperparameter settings
    EPOCHS = 20
    learning_rate = 5e-6  # the learning rate should not be too large
    optimizer = AdamW(model.parameters(), lr=learning_rate)
    # Use cross-entropy as the loss function
    criterion = nn.CrossEntropyLoss()

    with open('output.txt', 'w') as wf:
        wf.write('Batch Size: ' + str(batch_size) + '\tLearning Rate: ' +
                 str(learning_rate) + '\n')

    best_acc = 0
    # Set up data-parallel training; by default the model keeps its parameters on the GPU of device_ids[0], so this must match the device chosen above
    net = torch.nn.DataParallel(model, device_ids=[3, 4])
    net.to(device)
    # model.module.avgpool = nn.AdaptiveAvgPool2d(7)
    # Start training
    for Epoch in range(1, EPOCHS + 1):
        losses = 0  # loss
        accuracy = 0  # accuracy
        print('Epoch:', Epoch)

        model.train()
        for batch_index, batch in enumerate(train_dataloader):
            # print(batch_index)
            # print(batch)
            input_ids = batch[0].to(device)
            attention_mask = batch[1].to(device)
            token_type_ids = batch[2].to(device)
            label_ids = batch[3].to(device)
            # Feed the three inputs to the model
            output = net(  # forward
                input_ids=input_ids,
                attention_mask=attention_mask,
                token_type_ids=token_type_ids,
            )

            loss = criterion(output, label_ids)
            losses += loss.item()

            pred_labels = torch.argmax(output, dim=1)  # predicted labels
            acc = torch.sum(pred_labels == label_ids.to(device)).item() / len(
                pred_labels)  # acc
            accuracy += acc
            # Print accuracy and loss during training
            # print('Epoch: %d | Train: | Batch: %d / %d | Acc: %f | Loss: %f' % (Epoch, batch_index + 1, len(train_dataloader), acc, loss.item()))
            # Zero the gradients, backpropagate the loss, and take an optimizer step
            model.zero_grad()
            loss.backward()
            optimizer.step()
            # torch.cuda.empty_cache()

        average_loss = losses / len(train_dataloader)
        average_acc = accuracy / len(train_dataloader)
        # Print this epoch's training results
        print('\tTrain ACC:', average_acc, '\tLoss:', average_loss)
        # with open('output.txt', 'a') as rf:
        #     output_to_file = '\nEpoch: ' + str(Epoch) + '\tTrain ACC:' + str(average_acc) + '\tLoss: ' + str(
        #         average_loss)
        #     rf.write(output_to_file)

        # Validation
        model.eval()
        losses = 0  # loss
        accuracy = 0  # accuracy
        # Evaluate on the validation set
        for batch_index, batch in enumerate(valid_dataloader):
            input_ids = batch[0].to(device)
            attention_mask = batch[1].to(device)
            token_type_ids = batch[2].to(device)
            label_ids = batch[3].to(device)
            with torch.no_grad():
                output = model(  # forward
                    input_ids=input_ids,
                    attention_mask=attention_mask,
                    token_type_ids=token_type_ids,
                )
            loss = criterion(output, label_ids)
            losses += loss.item()
            # Both of the following operations act directly on the output tensor
            pred_labels = torch.argmax(output, dim=1)  # predicted labels
            acc = torch.sum(pred_labels == label_ids.to(device)).item() / len(
                pred_labels)  # acc
            accuracy += acc

        average_loss = losses / len(valid_dataloader)
        average_acc = accuracy / len(valid_dataloader)

        print('\tValid ACC:', average_acc, '\tLoss:', average_loss)
        # with open('output.txt', 'a') as rf:
        #     output_to_file = '\nEpoch: ' + str(Epoch) + '\tValid ACC:' + str(average_acc) + '\tLoss: ' + str(
        #         average_loss) + '\n'
        #     rf.write(output_to_file)

        if average_acc > best_acc:
            best_acc = average_acc
            torch.save(model.state_dict(), 'best_model_on_trainset.pkl')
Example #16
def main(args, f):
    # args = parse_arguments()
    set_seed(args.train_seed)

    if args.model in ['roberta', 'distilroberta']:
        tokenizer = RobertaTokenizer.from_pretrained('roberta-base')
    else:
        tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

    # preprocess data
    src_eval_loader, src_loader, tgt_all_loader, tgt_train_loader = get_all_dataloader(
        args, tokenizer)

    # load models
    if args.model == 'bert':
        src_encoder = BertEncoder()
        tgt_encoder = BertEncoder()
        src_classifier = BertClassifier()
    elif args.model == 'distilbert':
        src_encoder = DistilBertEncoder()
        tgt_encoder = DistilBertEncoder()
        src_classifier = BertClassifier()
    elif args.model == 'roberta':
        src_encoder = RobertaEncoder()
        tgt_encoder = RobertaEncoder()
        src_classifier = RobertaClassifier()
    else:
        src_encoder = DistilRobertaEncoder()
        tgt_encoder = DistilRobertaEncoder()
        src_classifier = RobertaClassifier()
    discriminator = Discriminator()  # output dims is 2 instead of 1

    if args.load:
        src_encoder = init_model(args,
                                 src_encoder,
                                 restore_path=param.src_encoder_path)
        src_classifier = init_model(args,
                                    src_classifier,
                                    restore_path=param.src_classifier_path)
        tgt_encoder = init_model(args,
                                 tgt_encoder,
                                 restore_path=param.tgt_encoder_path)
        discriminator = init_model(args,
                                   discriminator,
                                   restore_path=param.d_model_path)
    else:
        src_encoder = init_model(args, src_encoder)
        src_classifier = init_model(args, src_classifier)
        tgt_encoder = init_model(args, tgt_encoder)
        discriminator = init_model(args, discriminator)

    # parallel models
    if torch.cuda.device_count() > 1:
        print('Let\'s use {} GPUs!'.format(torch.cuda.device_count()))
        src_encoder = nn.DataParallel(src_encoder)
        src_classifier = nn.DataParallel(src_classifier)
        tgt_encoder = nn.DataParallel(tgt_encoder)
        discriminator = nn.DataParallel(discriminator)

    # train source model
    print("=== Training classifier for source domain ===")
    if args.pretrain:
        src_encoder, src_classifier = pretrain(args, src_encoder,
                                               src_classifier, src_loader)

    # eval source model
    print("=== Evaluating classifier for source domain ===")
    evaluate(args, src_encoder, src_classifier, src_loader)
    src_acc = evaluate(args, src_encoder, src_classifier, tgt_all_loader)
    f.write(f'{args.src} -> {args.tgt}: No adapt acc on src data: {src_acc}\n')

    for params in src_encoder.parameters():
        params.requires_grad = False

    # train target encoder by ADDA
    print("=== Training encoder for target domain ===")
    if args.adapt:
        tgt_encoder.load_state_dict(src_encoder.state_dict())
        tgt_encoder = adda_adapt(args, src_encoder, tgt_encoder, discriminator,
                                 src_loader, tgt_train_loader)

    # argument setting
    print(
        f"model_type: {args.model}; max_seq_len: {args.max_seq_length}; batch_size: {args.batch_size}; "
        f"pre_epochs: {args.pre_epochs}; num_epochs: {args.num_epochs}; src: {args.src}; tgt: {args.tgt}"
    )

    # eval target encoder on lambda0.1 set of target dataset
    print("=== Evaluating classifier for encoded target domain ===")
    print(">>> domain adaption <<<")
    tgt_acc = evaluate(args, tgt_encoder, src_classifier, tgt_all_loader)
    f.write(f'{args.src} -> {args.tgt}: DA acc on tgt data: {tgt_acc}\n')
    f.write(
        f"model_type: {args.model}; batch_size: {args.batch_size}; pre_epochs: {args.pre_epochs}; "
        f"num_epochs: {args.num_epochs}; src_free: {args.src_free}; src: {args.src}; "
        f"tgt: {args.tgt};\n\n")
Example #17
def main():

    # Hyperparameter settings
    batch_size = 4
    device = 'cuda' if torch.cuda.is_available() else 'cpu'
    epochs = 10
    learning_rate = 5e-6  # the learning rate should not be too large

    # Load the datasets
    train_dataset = CNewsDataset('data/cnews/cnews.train.txt')
    valid_dataset = CNewsDataset('data/cnews/cnews.val.txt')
    #test_data = load_data('cnews/cnews.test.txt')

    # Create batches
    train_dataloader = DataLoader(train_dataset,
                                  batch_size=batch_size,
                                  shuffle=True)
    valid_dataloader = DataLoader(valid_dataset,
                                  batch_size=batch_size,
                                  shuffle=False)
    #test_dataloader = DataLoader(valid_data, batch_size=batch_size, shuffle=False)

    # Load the BERT configuration
    bert_config = BertConfig.from_pretrained('bert-base-chinese')
    num_labels = len(train_dataset.labels)

    # Initialize the model
    model = BertClassifier(bert_config, num_labels).to(device)

    optimizer = AdamW(model.parameters(), lr=learning_rate)
    criterion = nn.CrossEntropyLoss()

    best_acc = 0

    for epoch in range(1, epochs + 1):
        losses = 0  # loss
        accuracy = 0  # accuracy

        model.train()
        train_bar = tqdm(train_dataloader)
        for input_ids, token_type_ids, attention_mask, label_id in train_bar:
            model.zero_grad()
            train_bar.set_description('Epoch %i train' % epoch)

            output = model(
                input_ids=input_ids.to(device),
                attention_mask=attention_mask.to(device),
                token_type_ids=token_type_ids.to(device),
            )

            loss = criterion(output, label_id.to(device))
            losses += loss.item()

            pred_labels = torch.argmax(output, dim=1)  # predicted labels
            acc = torch.sum(pred_labels == label_id.to(device)).item() / len(
                pred_labels)  #acc
            accuracy += acc

            loss.backward()
            optimizer.step()
            train_bar.set_postfix(loss=loss.item(), acc=acc)

        average_loss = losses / len(train_dataloader)
        average_acc = accuracy / len(train_dataloader)

        print('\tTrain ACC:', average_acc, '\tLoss:', average_loss)

        # Validation
        model.eval()
        losses = 0  # loss
        accuracy = 0  # accuracy
        valid_bar = tqdm(valid_dataloader)
        for input_ids, token_type_ids, attention_mask, label_id in valid_bar:
            valid_bar.set_description('Epoch %i valid' % epoch)
            output = model(
                input_ids=input_ids.to(device),
                attention_mask=attention_mask.to(device),
                token_type_ids=token_type_ids.to(device),
            )

            loss = criterion(output, label_id.to(device))
            losses += loss.item()

            pred_labels = torch.argmax(output, dim=1)  # predicted labels
            acc = torch.sum(pred_labels == label_id.to(device)).item() / len(
                pred_labels)  #acc
            accuracy += acc
            valid_bar.set_postfix(loss=loss.item(), acc=acc)

        average_loss = losses / len(valid_dataloader)
        average_acc = accuracy / len(valid_dataloader)

        print('\tValid ACC:', average_acc, '\tLoss:', average_loss)

        if average_acc > best_acc:
            best_acc = average_acc
            torch.save(model.state_dict(), 'models/best_model.pkl')
Example #18
class Predictor(object):
    def __init__(self, config):
        self.model = None
        self.config = config

        self.output_path = config["output_path"]
        self.vocab_path = os.path.join(config["bert_model_path"], "vocab.txt")
        self.label_to_index = self.load_vocab()
        self.index_to_label = {value: key for key, value in self.label_to_index.items()}
        self.word_vectors = None
        self.sequence_length = self.config["sequence_length"]

        # Create the model
        self.create_model()
        # Load the computation graph
        self.load_graph()

    def load_vocab(self):
        # Load the label-to-index mapping table

        with open(os.path.join(self.output_path, "label_to_index.json"), "r") as f:
            label_to_index = json.load(f)

        return label_to_index

    def padding(self, input_id, input_mask, segment_id):
        """
        Pad the sequence to the configured length
        :param input_id:
        :param input_mask:
        :param segment_id:
        :return:
        """

        if len(input_id) < self.sequence_length:
            pad_input_id = input_id + [0] * (self.sequence_length - len(input_id))
            pad_input_mask = input_mask + [0] * (self.sequence_length - len(input_mask))
            pad_segment_id = segment_id + [0] * (self.sequence_length - len(segment_id))
        else:
            pad_input_id = input_id[:self.sequence_length]
            pad_input_mask = input_mask[:self.sequence_length]
            pad_segment_id = segment_id[:self.sequence_length]

        return pad_input_id, pad_input_mask, pad_segment_id

    def sentence_to_idx(self, text):
        """
        Convert a tokenized sentence into its index representation
        :return:
        """
        tokenizer = tokenization.FullTokenizer(vocab_file=self.vocab_path, do_lower_case=True)

        text = tokenization.convert_to_unicode(text)
        tokens = tokenizer.tokenize(text)
        tokens = ["[CLS]"] + tokens + ["[SEP]"]
        input_id = tokenizer.convert_tokens_to_ids(tokens)
        input_mask = [1] * len(input_id)
        segment_id = [0] * len(input_id)

        input_id, input_mask, segment_id = self.padding(input_id, input_mask, segment_id)

        return [input_id], [input_mask], [segment_id]

    def load_graph(self):
        """
        Load the computation graph
        :return:
        """
        self.sess = tf.Session()
        ckpt = tf.train.get_checkpoint_state(self.config["ckpt_model_path"])
        if ckpt and tf.train.checkpoint_exists(ckpt.model_checkpoint_path):
            print('Reloading model parameters..')
            self.model.saver.restore(self.sess, ckpt.model_checkpoint_path)
        else:
            raise ValueError('No such file:[{}]'.format(self.config["ckpt_model_path"]))

    def create_model(self):
        """
                根据config文件选择对应的模型,并初始化
                :return:
                """
        self.model = BertClassifier(config=self.config, is_training=False)

    def predict(self, text):
        """
        Given a tokenized sentence, predict its class label
        :param text:
        :return:
        """
        input_ids, input_masks, segment_ids = self.sentence_to_idx(text)

        prediction = self.model.infer(self.sess,
                                      dict(input_ids=input_ids,
                                           input_masks=input_masks,
                                           segment_ids=segment_ids)).tolist()[0]
        label = self.index_to_label[prediction]
        return label
Example #19
 def create_model(self):
     """
             根据config文件选择对应的模型,并初始化
             :return:
             """
     self.model = BertClassifier(config=self.config, is_training=False)
Example #20
        total_correct += (res == gt).sum()
        total_count += gt.shape[0]
        total_loss.append(criterion(preds, labels).item())

    loss, acc = np.array(total_loss).mean(), total_correct / total_count
    print("Average Loss: {:.6f}, Accuracy: {:.6f}".format(loss, acc))
    return loss, acc


device = 'cuda' if torch.cuda.is_available() else 'cpu'
epochs = 30
best_acc = 0.0
eval_losses, eval_accs = [], []
train_losses, train_accs = [], []

model = BertClassifier(freeze_bert=False)
model = model.to(device)
# model = nn.DataParallel(model)

train_dataset = EmojiDataset('../../data/train_bert_sentences.npy',
                             '../../data/train_bert_labels.npy')
train_dataloader = DataLoader(train_dataset,
                              batch_size=64,
                              shuffle=False,
                              collate_fn=collate_fn)

test_dataset = EmojiDataset('../../data/test_bert_sentences.npy',
                            '../../data/test_bert_labels.npy')
test_dataloader = DataLoader(test_dataset,
                             batch_size=128,
                             shuffle=False,
Example #21
def main(args, f):
    # args = parse_arguments()
    set_seed(args.train_seed)

    if args.model in ['roberta', 'distilroberta']:
        tokenizer = RobertaTokenizer.from_pretrained('roberta-base')
    else:
        tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

    # preprocess data
    src_eval_loader, src_loader, tgt_all_loader, tgt_train_loader = get_all_dataloader(
        args, tokenizer)

    # load models
    if args.model == 'bert':
        encoder = BertEncoder()
        src_encoder = BertEncoder()
        classifier = BertClassifier()
    elif args.model == 'distilbert':
        encoder = DistilBertEncoder()
        src_encoder = DistilBertEncoder()
        classifier = BertClassifier()
    elif args.model == 'roberta':
        encoder = RobertaEncoder()
        src_encoder = RobertaEncoder()
        classifier = RobertaClassifier()
    else:
        encoder = DistilRobertaEncoder()
        src_encoder = DistilRobertaEncoder()
        classifier = RobertaClassifier()

    # domain discriminator
    discriminator = AdversarialNetworkCdan(param.input_dim * param.num_labels,
                                           param.hidden_dim)

    # parallel models
    if torch.cuda.device_count() > 1:
        print('Let\'s use {} GPUs!'.format(torch.cuda.device_count()))
        encoder = nn.DataParallel(encoder)
        src_encoder = nn.DataParallel(src_encoder)
        classifier = nn.DataParallel(classifier)
        discriminator = nn.DataParallel(discriminator)

    if args.load:
        encoder = init_model(args,
                             encoder,
                             restore_path=param.src_encoder_path)
        src_encoder = init_model(args,
                                 src_encoder,
                                 restore_path=param.tgt_encoder_path)
        classifier = init_model(args,
                                classifier,
                                restore_path=param.src_classifier_path)
        # discriminator = init_model(args, discriminator, restore_path=param.d_model_path)
    else:
        encoder = init_model(args, encoder)
        src_encoder = init_model(args, src_encoder)
        classifier = init_model(args, classifier)

    discriminator = init_model(args, discriminator)

    # train source model
    print("=== Pretrain encoder for source domain ===")
    if args.pretrain:
        encoder, classifier = pretrain(args, encoder, classifier, src_loader)

    # eval source model
    print("=== Evaluating classifier for source domain ===")
    evaluate(args, encoder, classifier, src_loader)
    src_acc = evaluate(args, encoder, classifier, tgt_all_loader)
    f.write(f'{args.src} -> {args.tgt} no adapt acc on src data: {src_acc}\n')
    # x, y = save_features(args, encoder, src_loader)
    # np.savez(os.path.join(param.model_root, 's_feat_pretrain'), x, y)
    # x, y = save_features(args, encoder, tgt_all_loader)
    # np.savez(os.path.join(param.model_root, 't_feat_pretrain'), x, y)

    # adapt
    print("=== Adapt encoder for target domain ===")
    src_encoder.load_state_dict(encoder.state_dict())
    if args.src_free:
        # use the same encoder and copy encoder to src_encoder have different baseline results
        s_res_features = src_gmm(args, encoder, src_loader)
        src_loader = s_numpy_dataloader(s_res_features, args.batch_size)
        encoder, classifier = cdan_adapt_src_free(args, encoder, src_encoder,
                                                  discriminator, classifier,
                                                  src_loader, tgt_train_loader,
                                                  tgt_all_loader)
    elif args.data_free:
        s_res_features = src_gmm(args, encoder, src_loader)
        t_res_features = tgt_gmm(encoder, tgt_all_loader, 1)
        src_loader = s_numpy_dataloader(s_res_features, args.batch_size)
        tgt_train_loader = t_numpy_dataloader(t_res_features, args.batch_size)
        encoder, classifier = cdan_adapt_data_free(args, encoder,
                                                   discriminator, classifier,
                                                   src_loader,
                                                   tgt_train_loader,
                                                   tgt_all_loader)
    else:
        encoder, classifier = cdan_adapt(args, encoder, discriminator,
                                         classifier, src_loader,
                                         tgt_train_loader, tgt_all_loader)
    # x, y = save_features(args, encoder, src_loader)
    # np.savez(os.path.join(param.model_root, 's_feat_adapt_cdan'), x, y)
    # x, y = save_features(args, encoder, tgt_all_loader)
    # np.savez(os.path.join(param.model_root, 't_feat_adapt_cdan'), x, y)

    # argument setting
    print(
        f"model_type: {args.model}; batch_size: {args.batch_size}; data_free: {args.data_free}; "
        f"src_free: {args.src_free}; pre_epochs: {args.pre_epochs}; num_epochs: {args.num_epochs}; "
        f"src: {args.src}; tgt: {args.tgt}; kd: {args.kd}; dp: {args.dp}; ent: {args.ent}"
    )

    # eval target encoder on lambda0.1 set of target dataset
    print("=== Evaluating classifier for encoded target domain ===")
    print(">>> domain adaption <<<")
    tgt_acc = evaluate(args, encoder, classifier, tgt_all_loader)
    f.write(f'{args.src} -> {args.tgt}: DA acc on tgt data: {tgt_acc}\n')
    f.write(
        f"model_type: {args.model}; batch_size: {args.batch_size}; data_free: {args.data_free}; "
        f"src_free: {args.src_free}; pre_epochs: {args.pre_epochs}; num_epochs: {args.num_epochs}; "
        f"src: {args.src}; tgt: {args.tgt}; kd: {args.kd}; dp: {args.dp}; ent: {args.ent}\n\n"
    )
Example #22
import torch
from model import BertClassifier
from transformers import BertTokenizer, BertConfig
from train import get_bert_input
import pandas as pd

labels = ['体育', '娱乐', '家居', '房产', '教育', '时尚', '时政', '游戏', '科技', '财经']

bert_config = BertConfig.from_pretrained('chinese_wwm_pytorch')
bert_config.num_labels = len(labels)
model = BertClassifier(bert_config)
model.load_state_dict(
    torch.load('./best_model_on_trainset.pkl',
               map_location=torch.device('cpu')))

tokenizer = BertTokenizer(vocab_file='chinese_wwm_pytorch/vocab.txt')

device = torch.device('cuda:2' if torch.cuda.is_available() else 'cpu')
model = torch.nn.DataParallel(model, device_ids=[2])
model.to(device)


def predict_text(text):
    input_id, attention_mask, token_type_id = get_bert_input(text, tokenizer)

    input_id = torch.tensor([input_id], dtype=torch.long)
    attention_mask = torch.tensor([attention_mask], dtype=torch.long)
    token_type_id = torch.tensor([token_type_id], dtype=torch.long)

    predicted = model(
        input_id,
Example #23
    # model = EmbedCosSim(text_field, embedding_dim, use_glove=True, glove_dim=100,
    #                   checkpoint_name='checkpoints/embed_cos_sim_glove.pt')  # for training model with GloVe
    # model = RNNClassifier(text_field, embedding_dim, hidden_dim, rnn_type="GRU", bidir=False,
    #                      checkpoint_name='checkpoints/gru.pt')
    # in the above line, you can change rnn_type to either RNN_TANH, GRU, or LSTM to create a different network
    # you can also set bidir=True to create a bidirectional network
    # model = CNNClassifier(text_field, embedding_dim, num_filters=32, filter_sizes=[1, 2, 3, 5],
    #                      checkpoint_name='checkpoints/cnn.pt')
    tokenizer = transformers.BertTokenizer.from_pretrained('bert-base-uncased',
                                                           do_lower=True)
    train_iter, val_iter, test_iter, text_field, label_field = prep_torch_data(
        batch_size=32, transformer_tokenize=tokenizer)
    bert = transformers.BertModel.from_pretrained('bert-base-uncased')
    for i in bert.parameters():
        i.requires_grad = False
    model = BertClassifier(bert, checkpoint_name='checkpoints/bert.pt')

    optimizer = optim.Adam(model.parameters())
    # move everything to gpu if available
    device = ("cuda" if torch.cuda.is_available() else "cpu")
    if device == "cuda":
        model.cuda()
        torch.set_default_tensor_type('torch.cuda.FloatTensor')

    train(model,
          train_iter,
          val_iter,
          test_iter,
          optimizer,
          criterion,
          n_epochs=50,
Example #24
                                 second_key=args.second_sentence,
                                 device="cuda:0")
    dev_dataset = make_dataset(tokenizer, dev_data, pos_label=args.pos_label, 
                               answer_field=args.answer_field, 
                               first_key=args.first_sentence,
                               second_key=args.second_sentence,
                               device="cuda:0")
    train_dataloader = make_dataloader(train_dataset, batch_size=args.train_batch_size)
    dev_dataloader = make_dataloader(dev_dataset, batch_size=args.dev_batch_size, shuffle=False)

    if args.batch_size is None:
        args.batch_size = args.train_batch_size
    if args.batch_size % args.train_batch_size != 0:
        raise ValueError("GPU batch size should divide batch size per update.")
    batches_per_update = args.batch_size // args.train_batch_size
    bert_classifier = BertClassifier(model, state_key="pooler_output", 
                                     lr=args.lr, accumulate_gradients=batches_per_update).to("cuda:0")

    best_score, best_weights = 0.0, None

    if args.load_file:
        bert_classifier.load_state_dict(torch.load(args.load_file))
    if args.train:
        model.train()
        for epoch in range(args.nepochs):
            progress_bar = tqdm.tqdm(train_dataloader)
            metrics = initialize_metrics()
            for i, batch in enumerate(progress_bar, 1):
                outputs = bert_classifier.train_on_batch(batch)
                postfix = update_metrics(metrics, outputs, batch["labels"])
                progress_bar.set_postfix(postfix)
                if (args.eval_every_n_batches > 0 and i % args.eval_every_n_batches == 0 and
Exemple #25
0
class Evaluator(object):
    def __init__(self, args):
        pretrain_name = 'bert-base-cased'
        if args.model_info.bert_path:
            pretrain_name = args.model_info.bert_path
        print(f"Tokenizer from:{pretrain_name}")
        train_conf = args.train_info
        model_conf = args.model_info
        self.model_type = model_conf.model
        if self.model_type == 'bert_seq':
            self.model = BertClassifier(model_conf)
            self.tokenizer = BertTokenizer.from_pretrained(pretrain_name)
            self.ds = SentimentDataset
        if self.model_type == 'GPT2':
            self.model = GPT2Classifier(model_conf)
            self.tokenizer = GPT2Tokenizer.from_pretrained(pretrain_name)
            self.ds = GPT2Dataset
        self.model.load_state_dict(torch.load(train_conf.model_path))
        self.device = train_conf.device
        self.class_num = model_conf.class_num
        self.model.to(self.device)
        self.lr = train_conf.lr
        self.max_len = train_conf.max_seq_len
        self.conf = args
        self.label_map = json.load(open(args.label_map_path))
        self.id2label = dict([(i, label_str)
                              for label_str, i in self.label_map.items()])

    def run(self, batch_size=64):
        test_path = self.conf.train_info.test_path
        test_loader = self.get_data_loader(test_path, batch_size)
        acc, recall, f1_score, cm, report, res = self.evaluate(test_loader)
        print(f"Accuracy score of the model is {acc}")
        print(f"Recall score of the model is {recall}")
        print(f"F1 score of the model is {f1_score}")
        print(f"Confusion matrix of the model is {cm}")
        print(report)
        dir_ = os.path.dirname(test_path)
        dir_ = os.path.dirname(dir_)
        dir_ = os.path.split(dir_)[0]
        new_path = os.path.join(dir_, 'logs', 'bad_case.json')
        with open(new_path, 'w') as f:
            for i in res:
                print(json.dumps(i, ensure_ascii=False), file=f)

    def evaluate(self, _loader):
        self.model.eval()
        y_true = list()
        y_pred = list()
        res = []
        with torch.no_grad():
            for batch in _loader:
                input_ids = batch['input_ids'].to(self.device)
                attention_mask = batch['attention_mask'].to(self.device)
                y = batch['labels']
                y = torch.squeeze(y, 1)
                y = y.to(self.device)
                logits = self.model(input_ids, attention_mask)
                y_true.append(y)
                y_pred.append(logits)
                pred_labels = torch.argmax(logits, dim=1)
                preds = pred_labels.cpu().numpy()
                true = batch['labels'].squeeze(1).numpy()  # keep the batch dimension even for a single example
                if len(true) < 1:
                    continue
                for i, c_y in enumerate(true):
                    if c_y != preds[i]:
                        tmp_dict = {
                            'true_label': self.id2label[c_y],
                            'pred_label': self.id2label[preds[i]],
                            'text': batch['text'][i]
                        }
                        res.append(tmp_dict)
            y_true = torch.cat(y_true)
            y_pred = torch.cat(y_pred)
        cm = metrics.cal_cm(y_true, y_pred)
        acc_score = metrics.cal_accuracy(y_true, y_pred)
        recall = metrics.cal_recall(y_true, y_pred)
        f1_score = metrics.cal_f1(y_true, y_pred)
        label_range = [i for i in range(len(self.label_map))]
        target_name = [
            x[0] for x in sorted(self.label_map.items(), key=lambda x: x[1])
        ]
        report = metrics.get_classification_report(y_true, y_pred, label_range,
                                                   target_name)
        return acc_score, recall, f1_score, cm, report, res

    def get_data_loader(self, f_path, batch_size):
        np.random.seed(14)
        texts, labels = prepare(f_path, self.label_map)
        ds = self.ds(self.tokenizer, texts, labels, self.max_len)
        return dataloader.DataLoader(ds,
                                     batch_size=batch_size,
                                     num_workers=self.conf.num_workers,
                                     shuffle=True)
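
# minimal usage sketch (the args configuration object is assumed to mirror __init__ above):
# evaluator = Evaluator(args)
# evaluator.run(batch_size=64)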
Exemple #26
0
adj, features, y_train, y_val, y_test, train_mask, val_mask, test_mask, train_size, test_size = load_corpus(
    dataset)
'''
y_train, y_val, y_test: n x c one-hot label matrices
train_mask, val_mask, test_mask: length-n boolean arrays
train_size, test_size: unused
'''

# compute number of real train/val/test/word nodes and number of classes
nb_node = adj.shape[0]
nb_train, nb_val, nb_test = train_mask.sum(), val_mask.sum(), test_mask.sum()
nb_word = nb_node - nb_train - nb_val - nb_test
nb_class = y_train.shape[1]

# instantiate model according to class number
model = BertClassifier(pretrained_model=bert_init, nb_class=nb_class)

# transform one-hot label to class ID for pytorch computation
y = th.LongTensor((y_train + y_val + y_test).argmax(axis=1))
label = {}
label['train'] = y[:nb_train]
label['val'] = y[nb_train:nb_train + nb_val]
label['test'] = y[-nb_test:]

# load documents and compute input encodings
corpus_file = './data/corpus/' + dataset + '_shuffle.txt'
with open(corpus_file, 'r') as f:
    text = f.read()
    text = text.replace('\\', '')
    text = text.split('\n')

Exemple #27
0
def main():
    args = parse_arguments()
    # argument setting
    print("=== Argument Setting ===")
    print("src: " + args.src)
    print("tgt: " + args.tgt)
    print("seed: " + str(args.seed))
    print("train_seed: " + str(args.train_seed))
    print("model_type: " + str(args.model))
    print("max_seq_length: " + str(args.max_seq_length))
    print("batch_size: " + str(args.batch_size))
    print("pre_epochs: " + str(args.pre_epochs))
    print("num_epochs: " + str(args.num_epochs))
    print("AD weight: " + str(args.alpha))
    print("KD weight: " + str(args.beta))
    print("temperature: " + str(args.temperature))
    set_seed(args.train_seed)

    if args.model in ['roberta', 'distilroberta']:
        tokenizer = RobertaTokenizer.from_pretrained('roberta-base')
    else:
        tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

    # preprocess data
    print("=== Processing datasets ===")
    if args.src in ['blog', 'airline', 'imdb']:
        src_x, src_y = CSV2Array(
            os.path.join('data', args.src, args.src + '.csv'))
    else:
        src_x, src_y = XML2Array(
            os.path.join('data', args.src, 'negative.review'),
            os.path.join('data', args.src, 'positive.review'))

    src_x, src_test_x, src_y, src_test_y = train_test_split(
        src_x, src_y, test_size=0.2, stratify=src_y, random_state=args.seed)

    if args.tgt in ['blog', 'airline', 'imdb']:
        tgt_x, tgt_y = CSV2Array(
            os.path.join('data', args.tgt, args.tgt + '.csv'))
    else:
        tgt_x, tgt_y = XML2Array(
            os.path.join('data', args.tgt, 'negative.review'),
            os.path.join('data', args.tgt, 'positive.review'))

    tgt_train_x, tgt_test_x, tgt_train_y, tgt_test_y = train_test_split(
        tgt_x, tgt_y, test_size=0.2, stratify=tgt_y, random_state=args.seed)

    if args.model in ['roberta', 'distilroberta']:
        src_features = roberta_convert_examples_to_features(
            src_x, src_y, args.max_seq_length, tokenizer)
        src_test_features = roberta_convert_examples_to_features(
            src_test_x, src_test_y, args.max_seq_length, tokenizer)
        tgt_features = roberta_convert_examples_to_features(
            tgt_x, tgt_y, args.max_seq_length, tokenizer)
        tgt_train_features = roberta_convert_examples_to_features(
            tgt_train_x, tgt_train_y, args.max_seq_length, tokenizer)
    else:
        src_features = convert_examples_to_features(src_x, src_y,
                                                    args.max_seq_length,
                                                    tokenizer)
        src_test_features = convert_examples_to_features(
            src_test_x, src_test_y, args.max_seq_length, tokenizer)
        tgt_features = convert_examples_to_features(tgt_x, tgt_y,
                                                    args.max_seq_length,
                                                    tokenizer)
        tgt_train_features = convert_examples_to_features(
            tgt_train_x, tgt_train_y, args.max_seq_length, tokenizer)

    # load dataset

    src_data_loader = get_data_loader(src_features, args.batch_size)
    src_data_eval_loader = get_data_loader(src_test_features, args.batch_size)
    tgt_data_train_loader = get_data_loader(tgt_train_features,
                                            args.batch_size)
    tgt_data_all_loader = get_data_loader(tgt_features, args.batch_size)

    # load models
    if args.model == 'bert':
        src_encoder = BertEncoder()
        tgt_encoder = BertEncoder()
        src_classifier = BertClassifier()
    elif args.model == 'distilbert':
        src_encoder = DistilBertEncoder()
        tgt_encoder = DistilBertEncoder()
        src_classifier = BertClassifier()
    elif args.model == 'roberta':
        src_encoder = RobertaEncoder()
        tgt_encoder = RobertaEncoder()
        src_classifier = RobertaClassifier()
    else:
        src_encoder = DistilRobertaEncoder()
        tgt_encoder = DistilRobertaEncoder()
        src_classifier = RobertaClassifier()
    discriminator = Discriminator()

    if args.load:
        src_encoder = init_model(args,
                                 src_encoder,
                                 restore=param.src_encoder_path)
        src_classifier = init_model(args,
                                    src_classifier,
                                    restore=param.src_classifier_path)
        tgt_encoder = init_model(args,
                                 tgt_encoder,
                                 restore=param.tgt_encoder_path)
        discriminator = init_model(args,
                                   discriminator,
                                   restore=param.d_model_path)
    else:
        src_encoder = init_model(args, src_encoder)
        src_classifier = init_model(args, src_classifier)
        tgt_encoder = init_model(args, tgt_encoder)
        discriminator = init_model(args, discriminator)

    # train source model
    print("=== Training classifier for source domain ===")
    if args.pretrain:
        src_encoder, src_classifier = pretrain(args, src_encoder,
                                               src_classifier, src_data_loader)

    # eval source model
    print("=== Evaluating classifier for source domain ===")
    evaluate(src_encoder, src_classifier, src_data_loader)
    evaluate(src_encoder, src_classifier, src_data_eval_loader)
    evaluate(src_encoder, src_classifier, tgt_data_all_loader)

    for params in src_encoder.parameters():
        params.requires_grad = False

    for params in src_classifier.parameters():
        params.requires_grad = False

    # train target encoder by GAN
    print("=== Training encoder for target domain ===")
    if args.adapt:
        tgt_encoder.load_state_dict(src_encoder.state_dict())
        tgt_encoder = adapt(args, src_encoder, tgt_encoder, discriminator,
                            src_classifier, src_data_loader,
                            tgt_data_train_loader, tgt_data_all_loader)

    # eval target encoder on the full target dataset
    print("=== Evaluating classifier for encoded target domain ===")
    print(">>> source only <<<")
    evaluate(src_encoder, src_classifier, tgt_data_all_loader)
>> domain adaption <<<")">
    print(">>> domain adaptation <<<")
    evaluate(tgt_encoder, src_classifier, tgt_data_all_loader)
Exemple #28
0
def run():
    def collate_fn(
            batch: List[Tuple[torch.LongTensor, torch.LongTensor]],
            device: torch.device) -> Tuple[torch.LongTensor, torch.LongTensor]:

        x, y = list(zip(*batch))
        x = pad_sequence(x, batch_first=True, padding_value=0)
        y = torch.stack(y)
        return x.to(device), y.to(device)

    df = pd.read_csv("../inputs/Train.csv")
    # test = pd.read_csv("../inputs/Test.csv")

    train_df, val_df = train_test_split(df,
                                        stratify=df.label,
                                        test_size=VALID_SIZE,
                                        random_state=SEED)

    labels = ["Depression", "Alcohol", "Suicide", "Drugs"]
    train = pd.concat([train_df["text"],
                       pd.get_dummies(train_df['label']).reindex(columns=labels)],
                      axis=1)  # .reset_index(drop=True)

    valid = pd.concat([val_df["text"],
                       pd.get_dummies(val_df['label']).reindex(columns=labels)],
                      axis=1)  # .reset_index(drop=True)

    if DEVICE == 'cpu':
        print('cpu')
    else:
        n_gpu = torch.cuda.device_count()
        print(torch.cuda.get_device_name(0))

    train_dataset = MentalHealthDataset(config.TOKENIZER, train, lazy=True)
    valid_dataset = MentalHealthDataset(config.TOKENIZER, valid, lazy=True)
    collate_fn = partial(collate_fn, device=DEVICE)

    train_sampler = RandomSampler(train_dataset)
    valid_sampler = RandomSampler(valid_dataset)

    train_iterator = DataLoader(train_dataset,
                                batch_size=config.TRAIN_BATCH_SIZE,
                                sampler=train_sampler,
                                collate_fn=collate_fn)

    valid_iterator = DataLoader(valid_dataset,
                                batch_size=config.VALID_BATCH_SIZE,
                                sampler=valid_sampler,
                                collate_fn=collate_fn)

    # model = BertClassifier().to(DEVICE)
    model = BertClassifier(BertModel.from_pretrained(config.BERT_PATH),
                           4).to(DEVICE)

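    # split parameters into two groups so that bias and LayerNorm weights are
    # excluded from weight decay (a common convention when fine-tuning BERT)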
    no_decay = ['bias', 'LayerNorm.weight']
    optimizer_grouped_parameters = [{
        'params': [
            p for n, p in model.named_parameters()
            if not any(nd in n for nd in no_decay)
        ],
        'weight_decay':
        0.01
    }, {
        'params': [
            p for n, p in model.named_parameters()
            if any(nd in n for nd in no_decay)
        ],
        'weight_decay':
        0.0
    }]

    # triangular learning rate: linearly grows until the end of warmup, then linearly decays
    warmup_steps = 10**3  # 10 ** 3
    total_steps = len(train_iterator) * config.EPOCHS - warmup_steps
    optimizer = AdamW(optimizer_grouped_parameters, lr=LR, eps=1e-8)
    scheduler = get_linear_schedule_with_warmup(optimizer, warmup_steps,
                                                total_steps)
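    # the resulting multiplier rises linearly from 0 to 1 over warmup_steps and
    # then decays linearly back to 0 over the remaining steps; roughly
    # (illustrative sketch only):
    #
    #     def lr_factor(step):
    #         if step < warmup_steps:
    #             return step / max(1, warmup_steps)
    #         return max(0.0, (total_steps - step) / max(1, total_steps - warmup_steps))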
    # scheduler = WarmupLinearSchedule(optimizer, warmup_steps=warmup_steps, t_total=total_steps)

    # optimizer = torch.optim.Adam(model.parameters(), lr=LR) # 1e-4)
    # scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(optimizer, mode="min",
    #                                         patience=5, factor=0.3, min_lr=1e-10, verbose=True)

    for epoch in range(config.EPOCHS):
        print('=' * 5, f"EPOCH {epoch}", '=' * 5)
        engine.train_fn(train_iterator, model, optimizer, scheduler)
        engine.eval_fn(valid_iterator, model)

    model.eval()
    test_df = pd.read_csv("../inputs/Test.csv")
    submission = pd.read_csv('../inputs/SampleSubmission.csv')
    res = np.zeros((submission.shape[0], len(labels)))

    for i in tqdm(range(len(test_df) // config.TRAIN_BATCH_SIZE + 1)):
        batch_df = test_df.iloc[i * config.TRAIN_BATCH_SIZE:(i + 1) *
                                config.TRAIN_BATCH_SIZE]
        assert (batch_df["ID"] == submission["ID"]
                [i * config.TRAIN_BATCH_SIZE:(i + 1) *
                 config.TRAIN_BATCH_SIZE]).all(), f"Id mismatch"
        texts = []
        for text in batch_df["text"].tolist():
            text = config.TOKENIZER.encode(text, add_special_tokens=True)
            if len(text) > config.MAX_LEN:
                text = text[:config.MAX_LEN -
                            1] + [config.TOKENIZER.sep_token_id]
            texts.append(torch.LongTensor(text))
        x = pad_sequence(
            texts,
            batch_first=True,
            padding_value=config.TOKENIZER.pad_token_id).to(DEVICE)
        mask = (x != config.TOKENIZER.pad_token_id).float().to(DEVICE)

        with torch.no_grad():
            _, outputs = model(x, attention_mask=mask)
        outputs = outputs.cpu().numpy()
        submission.loc[i *
                       config.TRAIN_BATCH_SIZE:(i * config.TRAIN_BATCH_SIZE +
                                                len(outputs) - 1),
                       labels] = outputs

    submission.to_csv("../subs/submission_2.csv", index=False)