Ejemplo n.º 1
0
class Predictor(object):
    """Score raw sentences with a fine-tuned ``BertClassifier``.

    The tokenizer, checkpoint and label map are loaded once in ``__init__``;
    ``predict`` then returns, for every input sentence, a mapping from label
    string to the sigmoid of that label's logit.
    """

    def __init__(self, args):
        """Build the tokenizer, restore the model weights and read the labels.

        Args:
            args: configuration object. The code reads
                ``args.model_info`` (``bert_path``, ``class_num``),
                ``args.train_info`` (``device``, ``model_path``, ``lr``,
                ``max_seq_len`` and, later, ``batch_size``) and
                ``args.label_map_path`` (a JSON file mapping label string to
                integer id).
        """
        train_conf = args.train_info
        model_conf = args.model_info
        # Fall back to the stock checkpoint name when no path is configured.
        pretrain_name = model_conf.bert_path or 'bert-base-cased'
        self.tokenizer = BertTokenizer.from_pretrained(pretrain_name)
        print(f"Tokenizer from:{pretrain_name}")
        self.device = train_conf.device
        self.class_num = model_conf.class_num
        self.model = BertClassifier(model_conf)
        # map_location lets a checkpoint saved on another device be restored
        # on the configured one.
        self.model.load_state_dict(
            torch.load(train_conf.model_path,
                       map_location=torch.device(self.device)))
        self.model.to(self.device)
        self.lr = train_conf.lr
        self.max_len = train_conf.max_seq_len
        self.conf = args
        # Use a context manager so the label-map file handle is released.
        with open(args.label_map_path) as label_file:
            self.label_map = json.load(label_file)
        # Inverse mapping: integer id -> label string.
        self.id2label = {
            i: label_str for label_str, i in self.label_map.items()
        }
        self.softmax = Softmax(dim=1)

    def predict(self, sens):
        """Return per-label scores for each sentence in ``sens``.

        Args:
            sens: iterable of sentence strings.

        Returns:
            A list with one dict per scored sentence, mapping label string to
            ``float(sigmoid(logit))``. An empty list is returned when the
            data loader yields no batches.
        """
        # Inference must not run with dropout active; Evaluator.evaluate
        # does the same before scoring.
        self.model.eval()
        d_loader = self.sen_2_dl(sens)
        batch_scores = []
        with torch.no_grad():
            for batch in d_loader:
                input_ids = batch['input_ids'].to(self.device)
                attention_mask = batch['attention_mask'].to(self.device)
                logits = self.model(input_ids, attention_mask)
                batch_scores.append(torch.sigmoid(logits))
        if not batch_scores:
            # torch.cat raises on an empty list; nothing to score.
            return []
        scores = torch.cat(batch_scores).cpu().numpy()
        return [self._score_2_dict(row) for row in scores]

    def _score_2_dict(self, single_pred):
        """Map one score vector to ``{label string: float score}``.

        Position ``i`` of ``single_pred`` is reported under
        ``self.id2label[i]``.
        """
        return {
            self.id2label[i]: float(score)
            for i, score in enumerate(single_pred)
        }

    def sen_2_dl(self, sens):
        """Wrap ``sens`` in a non-shuffled ``DataLoader``.

        Args:
            sens: iterable of sentence strings; each one is stripped of
                surrounding whitespace before tokenization.

        Returns:
            A ``DataLoader`` over a ``SentimentDataset`` using the configured
            ``train_info.batch_size``.
        """
        texts = [sentence.strip() for sentence in sens]
        # The dataset requires labels although inference ignores them; supply
        # one placeholder per text so the two sequences stay aligned.
        labels = [999] * len(texts)
        ds = SentimentDataset(self.tokenizer, texts, labels, self.max_len)
        _loader = dataloader.DataLoader(
            ds, batch_size=self.conf.train_info.batch_size, shuffle=False)
        return _loader
Ejemplo n.º 2
0
class Evaluator(object):
    """Evaluate a fine-tuned classifier on the configured test set.

    Supports the ``'bert_seq'`` and ``'GPT2'`` model types, prints the
    metrics and writes every misclassified example to ``bad_case.json``.
    """

    def __init__(self, args):
        """Build the model/tokenizer pair, restore weights and read labels.

        Args:
            args: configuration object. The code reads
                ``args.model_info`` (``bert_path``, ``model``, ``class_num``),
                ``args.train_info`` (``device``, ``model_path``, ``lr``,
                ``max_seq_len``, ``test_path``), ``args.num_workers`` and
                ``args.label_map_path`` (JSON: label string -> integer id).

        Raises:
            ValueError: if ``model_info.model`` is neither ``'bert_seq'`` nor
                ``'GPT2'``.
        """
        train_conf = args.train_info
        model_conf = args.model_info
        # Fall back to the stock checkpoint name when no path is configured.
        pretrain_name = model_conf.bert_path or 'bert-base-cased'
        print(f"Tokenizer from:{pretrain_name}")
        self.model_type = model_conf.model
        if self.model_type == 'bert_seq':
            self.model = BertClassifier(model_conf)
            self.tokenizer = BertTokenizer.from_pretrained(pretrain_name)
            self.ds = SentimentDataset
        elif self.model_type == 'GPT2':
            self.model = GPT2Classifier(model_conf)
            self.tokenizer = GPT2Tokenizer.from_pretrained(pretrain_name)
            self.ds = GPT2Dataset
        else:
            # Fail with a clear message instead of an AttributeError on
            # self.model further down.
            raise ValueError(
                f"Unsupported model type: {self.model_type!r}")
        self.device = train_conf.device
        # map_location lets a checkpoint saved on another device be restored
        # on the configured one (same approach as Predictor).
        self.model.load_state_dict(
            torch.load(train_conf.model_path,
                       map_location=torch.device(self.device)))
        self.class_num = model_conf.class_num
        self.model.to(self.device)
        self.lr = train_conf.lr
        self.max_len = train_conf.max_seq_len
        self.conf = args
        # Use a context manager so the label-map file handle is released.
        with open(args.label_map_path) as label_file:
            self.label_map = json.load(label_file)
        # Inverse mapping: integer id -> label string.
        self.id2label = {
            i: label_str for label_str, i in self.label_map.items()
        }

    def run(self, batch_size=64):
        """Evaluate on ``train_info.test_path``, print metrics, dump errors.

        Misclassified examples are written one JSON object per line to
        ``<root>/logs/bad_case.json``, where ``<root>`` is three directory
        levels above the test file.

        Args:
            batch_size: batch size used for the evaluation data loader.
        """
        test_path = self.conf.train_info.test_path
        test_loader = self.get_data_loader(test_path, batch_size)
        acc, recall, f1_score, cm, report, res = self.evaluate(test_loader)
        print(f"Accuracy score of the model is {acc}")
        print(f"Recall score of the model is {recall}")
        print(f"F1 score of the model is {f1_score}")
        print(f"Confusion matrix of the model is {cm}")
        print(report)
        # Walk three levels up from the test file to reach the root that
        # holds the 'logs' directory.
        dir_ = os.path.dirname(test_path)
        dir_ = os.path.dirname(dir_)
        dir_ = os.path.split(dir_)[0]
        new_path = os.path.join(dir_, 'logs', 'bad_case.json')
        # ensure_ascii=False emits raw non-ASCII text, so pin the encoding;
        # the context manager guarantees the file is flushed and closed.
        with open(new_path, 'w', encoding='utf-8') as f:
            for bad_case in res:
                print(json.dumps(bad_case, ensure_ascii=False), file=f)

    def evaluate(self, _loader):
        """Score every batch of ``_loader`` and collect misclassifications.

        Args:
            _loader: iterable of batches exposing ``'input_ids'``,
                ``'attention_mask'``, ``'labels'`` and ``'text'``.

        Returns:
            Tuple ``(accuracy, recall, f1, confusion_matrix, report,
            bad_cases)`` where ``bad_cases`` is a list of dicts with keys
            ``'true_label'``, ``'pred_label'`` and ``'text'``.
        """
        self.model.eval()
        y_true = list()
        y_pred = list()
        res = []
        with torch.no_grad():
            for batch in _loader:
                input_ids = batch['input_ids'].to(self.device)
                attention_mask = batch['attention_mask'].to(self.device)
                y = batch['labels']
                y = torch.squeeze(y, 1)
                y = y.to(self.device)
                logits = self.model(input_ids, attention_mask)
                y_true.append(y)
                y_pred.append(logits)
                pred_labels = torch.argmax(logits, dim=1)
                preds = pred_labels.cpu().numpy()
                # reshape(-1) keeps a 1-d array even for a batch of one;
                # a bare squeeze() would collapse it to 0-d and break len().
                true = batch['labels'].reshape(-1).numpy()
                if true.size < 1:
                    continue
                for i, c_y in enumerate(true):
                    if c_y != preds[i]:
                        res.append({
                            'true_label': self.id2label[int(c_y)],
                            'pred_label': self.id2label[int(preds[i])],
                            'text': batch['text'][i]
                        })
            y_true = torch.cat(y_true)
            y_pred = torch.cat(y_pred)
        cm = metrics.cal_cm(y_true, y_pred)
        acc_score = metrics.cal_accuracy(y_true, y_pred)
        recall = metrics.cal_recall(y_true, y_pred)
        f1_score = metrics.cal_f1(y_true, y_pred)
        label_range = list(range(len(self.label_map)))
        # Label strings ordered by their integer id.
        target_name = [
            name for name, _ in sorted(self.label_map.items(),
                                       key=lambda item: item[1])
        ]
        report = metrics.get_classification_report(y_true, y_pred, label_range,
                                                   target_name)
        return acc_score, recall, f1_score, cm, report, res

    def get_data_loader(self, f_path, batch_size):
        """Build a shuffled ``DataLoader`` over the examples in ``f_path``.

        Args:
            f_path: path handed to ``prepare`` together with the label map.
            batch_size: number of examples per batch.

        Returns:
            A ``DataLoader`` over the dataset class selected in ``__init__``.
        """
        np.random.seed(14)
        texts, labels = prepare(f_path, self.label_map)
        ds = self.ds(self.tokenizer, texts, labels, self.max_len)
        return dataloader.DataLoader(ds,
                                     batch_size=batch_size,
                                     num_workers=self.conf.num_workers,
                                     shuffle=True)
Ejemplo n.º 3
0
                               device="cuda:0")
    train_dataloader = make_dataloader(train_dataset, batch_size=args.train_batch_size)
    dev_dataloader = make_dataloader(dev_dataset, batch_size=args.dev_batch_size, shuffle=False)

    if args.batch_size is None:
        args.batch_size = args.train_batch_size
    if args.batch_size % args.train_batch_size != 0:
        raise ValueError("GPU batch size should divide batch size per update.")
    batches_per_update = args.batch_size // args.train_batch_size
    bert_classifier = BertClassifier(model, state_key="pooler_output", 
                                     lr=args.lr, accumulate_gradients=batches_per_update).to("cuda:0")

    best_score, best_weights = 0.0, None

    if args.load_file:
        bert_classifier.load_state_dict(torch.load(args.load_file))
    if args.train:
        model.train()
        for epoch in range(args.nepochs):
            progress_bar = tqdm.tqdm(train_dataloader)
            metrics = initialize_metrics()
            for i, batch in enumerate(progress_bar, 1):
                outputs = bert_classifier.train_on_batch(batch)
                postfix = update_metrics(metrics, outputs, batch["labels"])
                progress_bar.set_postfix(postfix)
                if (args.eval_every_n_batches > 0 and i % args.eval_every_n_batches == 0 and
                            len(train_dataloader) - i >= args.eval_every_n_batches // 2) or\
                        i == len(train_dataloader):
                    dev_metrics = initialize_metrics()
                    dev_progress_bar = tqdm.tqdm(dev_dataloader)
                    for j, batch in enumerate(dev_progress_bar):
Ejemplo n.º 4
0
import torch
from model import BertClassifier
from transformers import BertTokenizer, BertConfig
from train import get_bert_input
import pandas as pd

# Category names; the list length sets the classifier's number of labels.
labels = ['体育', '娱乐', '家居', '房产', '教育', '时尚', '时政', '游戏', '科技', '财经']

# Load the BERT configuration and size the classification head to `labels`.
# NOTE(review): 'chinese_wwm_pytorch' looks like a local checkpoint directory
# (the tokenizer below reads vocab.txt from it) - confirm.
bert_config = BertConfig.from_pretrained('chinese_wwm_pytorch')
bert_config.num_labels = len(labels)
model = BertClassifier(bert_config)
# Weights are deserialized onto the CPU first; the model is moved to the
# target device further down.
model.load_state_dict(
    torch.load('./best_model_on_trainset.pkl',
               map_location=torch.device('cpu')))

# Tokenizer built directly from the vocabulary file.
tokenizer = BertTokenizer(vocab_file='chinese_wwm_pytorch/vocab.txt')

# Use GPU index 2 when CUDA is available, otherwise fall back to the CPU.
device = torch.device('cuda:2' if torch.cuda.is_available() else 'cpu')
# NOTE(review): device_ids=[2] is passed even on the CPU fallback path -
# confirm the DataParallel wrapper is intended when CUDA is unavailable.
model = torch.nn.DataParallel(model, device_ids=[2])
model.to(device)


def predict_text(text):
    input_id, attention_mask, token_type_id = get_bert_input(text, tokenizer)

    input_id = torch.tensor([input_id], dtype=torch.long)
    attention_mask = torch.tensor([attention_mask], dtype=torch.long)
    token_type_id = torch.tensor([token_type_id], dtype=torch.long)

    predicted = model(
        input_id,
Ejemplo n.º 5
0
# coding: utf-8
# @File: predict.py
# @Author: HE D.H.
# @Email: [email protected]
# @Time: 2020/10/10 17:13:57
# @Description:

import torch
from model import BertClassifier
from transformers import BertTokenizer, BertConfig

# Category names; the list length is passed to the classifier below.
labels = ['体育', '娱乐', '家居', '房产', '教育', '时尚', '时政', '游戏', '科技', '财经']
bert_config = BertConfig.from_pretrained('bert-base-chinese')

# Build the classifier and restore its weights onto the CPU.
model = BertClassifier(bert_config, len(labels))
model.load_state_dict(
    torch.load('models/best_model.pkl', map_location=torch.device('cpu')))

tokenizer = BertTokenizer.from_pretrained('bert-base-chinese')

# Banner shown before the prompt loop ("news category classification").
print('新闻类别分类')
# Interactive loop: read one line of text per iteration and tokenize it.
# NOTE(review): no exit condition is visible in this excerpt.
while True:
    text = input('Input: ')
    # Pad or truncate every input to exactly 512 tokens, special tokens
    # included.
    token = tokenizer(text,
                      add_special_tokens=True,
                      padding='max_length',
                      truncation=True,
                      max_length=512)
    input_ids = token['input_ids']
    attention_mask = token['attention_mask']
    token_type_ids = token['token_type_ids']
Ejemplo n.º 6
0
    attention_mask = (x != 0).float().to(config.DEVICE).long()
    outputs = MODEL(x, attention_mask=attention_mask)
    return outputs.cpu().detach().numpy()


@app.route('/predict')
def predict():
    """Score the ``comment`` query parameter and return the result as JSON.

    Returns:
        On success, a JSON body of the form
        ``{'response': {<label>: <score as str>, ..., 'comment': ...,
        'time_taken': <seconds as str>}}`` with one entry per name in
        ``config.CLASS_COLS``.
        When the ``comment`` parameter is absent, a JSON error body with
        HTTP status 400.
    """
    comment = request.args.get('comment')
    if comment is None:
        # Without this guard None reaches the model pipeline and the request
        # fails with an unhandled exception (HTTP 500).
        return flask.jsonify(
            {'error': "missing required query parameter 'comment'"}), 400
    start_time = time.time()
    prediction = comment_prediction(comment)
    # prediction[0] holds the scores for the single submitted comment; pair
    # them positionally with the configured class names.
    scores = {
        label: str(prob)
        for label, prob in zip(config.CLASS_COLS, prediction[0])
    }
    scores['comment'] = comment
    scores['time_taken'] = str(time.time() - start_time)

    return flask.jsonify({'response': scores})


if __name__ == '__main__':
    # Build the classifier, restore the trained weights and start the server.
    bert_config = BertConfig.from_pretrained(config.BERT_NAME)
    bert_config.num_labels = config.NUM_CLASSES
    MODEL = BertClassifier(bert_config)
    # map_location lets a checkpoint saved on another device (e.g. a GPU)
    # be restored on whatever device this host is configured to use.
    MODEL.load_state_dict(
        torch.load(config.TRAINED_MODEL_PATH, map_location=config.DEVICE))
    MODEL.to(config.DEVICE)
    # Inference only: disable dropout and similar training-time behaviour.
    MODEL.eval()
    app.run(host=config.HOST, port=config.PORT)
Ejemplo n.º 7
0
class NLU:
    """Natural-language-understanding helper for a movie dialogue system.

    Combines a BERT-based classifier that maps an utterance to one of the
    entries in ``order_list`` with keyword matching against genre keywords
    loaded from MySQL.
    """

    # Checkpoint used when the caller does not supply ``model_path``.
    DEFAULT_MODEL_PATH = "/Users/jinsakuma/Downloads/model_gpu_v4.3.3.pth"

    def __init__(self, config, RecognitionResultManager, model_path=None):
        """Load genre keywords, tokenizer, BERT backbone and classifier.

        Args:
            config: mapping; ``config['DB']`` must provide ``'host'``,
                ``'db_name'`` and ``'user'``.
            RecognitionResultManager: object exposing ``get_df()``; stored as
                ``self.rrm`` and used by ``get_text``.
            model_path: optional path to the classifier checkpoint. Defaults
                to ``DEFAULT_MODEL_PATH``.
        """
        self.config = config
        self.genre_keywords_db = self.get_db(self.config['DB']['host'],
                                             self.config['DB']['db_name'],
                                             self.config['DB']['user'])
        self.rrm = RecognitionResultManager

        self.model_path = (self.DEFAULT_MODEL_PATH
                           if model_path is None else model_path)
        self.model_config = BertConfig.from_pretrained(
            'cl-tohoku/bert-base-japanese-whole-word-masking',
            output_attentions=True)
        self.tokenizer = BertJapaneseTokenizer.from_pretrained(
            'cl-tohoku/bert-base-japanese-whole-word-masking')
        self.bert_model = BertModel.from_pretrained(
            'cl-tohoku/bert-base-japanese-whole-word-masking',
            config=self.model_config)
        self.model = BertClassifier(self.bert_model)
        self.max_len = 30
        self.load_weights(self.model_path)
        # The model is only used for inference; disable dropout.
        self.model.eval()

        self.device = torch.device("cpu")

        # Index i is the label reported for predicted class i.
        self.order_list = [
            'recommendation', 'title', 'abstract', 'review', 'evaluation',
            'actor', 'genre', 'director', None
        ]

    def get_db(self, host="localhost", db_name="woz_system", user="******"):
        """Fetch the keyword list of every genre from MySQL in one pass.

        Args:
            host: database host.
            db_name: database name.
            user: database user (connects with an empty password).

        Returns:
            dict mapping genre type (column 2 of ``genre``) to the list of
            keywords obtained by splitting the stored comma-separated string.
            Genres without a keyword row map to an empty list.
        """
        connector = MySQLdb.connect(host=host,
                                    db=db_name,
                                    user=user,
                                    passwd="",
                                    charset="utf8")
        try:
            cursor = connector.cursor()
            try:
                # Keywords used for movie recommendation.
                cursor.execute('select * from genre')
                genres = cursor.fetchall()
                genre_keywords_db = {}
                for genre in genres:
                    genre_id = genre[1]
                    genre_type = genre[2]
                    # Let the driver bind the value instead of formatting it
                    # into the SQL text.
                    cursor.execute(
                        'select keywords from genre_keywords '
                        'where genre_id=%s', (genre_id,))
                    keywords = cursor.fetchall()
                    # A genre with no keyword row keeps an empty list rather
                    # than raising IndexError.
                    genre_keywords_db[genre_type] = (
                        keywords[0][0].split(",") if keywords else [])
                return genre_keywords_db
            finally:
                cursor.close()
        finally:
            # Always release the connection, even when a query fails.
            connector.close()

    def load_weights(self, model_path):
        """Restore the classifier weights from ``model_path`` onto the CPU."""
        # 'cpu' remaps tensors saved on any device, not only cuda:0.
        load_weights = torch.load(model_path, map_location='cpu')
        self.model.load_state_dict(load_weights)

    def bert_tokenizer(self, input_text):
        """Encode ``input_text`` to a 1-d tensor of at most ``max_len`` ids."""
        return self.tokenizer.encode(input_text,
                                     max_length=self.max_len,
                                     truncation=True,
                                     return_tensors='pt')[0]

    def get_order(self, input_text):
        """Classify ``input_text`` and return the matching ``order_list`` entry.

        Returns:
            One of the strings in ``order_list``, or ``None`` for the last
            class.
        """
        token = self.bert_tokenizer(input_text)
        token = token.unsqueeze(0)  # add the batch dimension
        # No gradients are needed for inference.
        with torch.no_grad():
            output, attentions = self.model(token.to(self.device))
        _, pred = torch.max(output, 1)

        order = self.order_list[pred.item()]
        print("NLU result: ", order)
        return order

    def get_text(self, N):
        """Return the last ``N`` transcripts and their speakers.

        Returns:
            Tuple ``(text_list, target_list)`` taken from the ``'transcript'``
            and ``'speaker'`` columns of ``self.rrm.get_df()``.
        """
        # NOTE(review): with N == 0 the slice ``[-0:]`` selects every row,
        # not none - confirm callers never pass 0.
        df = self.rrm.get_df()
        text_list = df['transcript'].iloc[-N:].tolist()
        target_list = df['speaker'].iloc[-N:].tolist()
        return text_list, target_list

    def check_genre(self, input_texts):
        """Return the first genre whose keyword occurs in ``input_texts``.

        Texts are scanned from last to first, so the latest entry wins.

        Returns:
            The matching genre type, or ``None`` when no keyword matches.
        """
        # Keyword matching.
        for text in reversed(input_texts):
            for response_type, keywords in self.genre_keywords_db.items():
                if any(keyword in text for keyword in keywords):
                    return response_type

        return None