Example #1
    def initialize_detector(self):
        t1 = time.time()
        try:
            import kenlm
        except ImportError:
            raise ImportError(
                'mypycorrector dependencies are not fully installed; '
                'kenlm is required for the statistical language model. '
                'Please run "pip install kenlm" to install it. '
                'On Windows, install kenlm under Cygwin.')

        self.lm = kenlm.Model(self.language_model_path)
        logger.debug('Loaded language model: %s, spend: %s s' %
                     (self.language_model_path, str(time.time() - t1)))

        # word and char frequency dicts
        t2 = time.time()
        self.word_freq = self.load_word_freq_dict(self.word_freq_path)
        self.char_freq = self.load_char_freq_dict(self.char_freq_path)
        t3 = time.time()
        logger.debug(
            'Loaded word freq, char freq file: %s, size: %d, spend: %s s' %
            (self.word_freq_path, len(self.word_freq), str(t3 - t2)))
        # custom confusion set
        self.custom_confusion = self._get_custom_confusion_dict(
            self.custom_confusion_path)
        t4 = time.time()
        logger.debug('Loaded confusion file: %s, size: %d, spend: %s s' %
                     (self.custom_confusion_path, len(
                         self.custom_confusion), str(t4 - t3)))
        # custom segmentation dictionary
        self.custom_word_freq = self.load_word_freq_dict(
            self.custom_word_freq_path)
        self.person_names = self.load_word_freq_dict(self.person_name_path)
        self.place_names = self.load_word_freq_dict(self.place_name_path)
        self.stopwords = self.load_word_freq_dict(self.stopwords_path)
        # merge the segmentation dict with the custom dicts
        self.custom_word_freq.update(self.person_names)
        self.custom_word_freq.update(self.place_names)
        self.custom_word_freq.update(self.stopwords)

        self.word_freq.update(self.custom_word_freq)
        t5 = time.time()
        logger.debug('Loaded custom word file: %s, size: %d, spend: %s s' %
                     (self.custom_word_freq_path, len(
                         self.custom_word_freq), str(t5 - t4)))
        self.tokenizer = Tokenizer(dict_path=self.word_freq_path,
                                   custom_word_freq_dict=self.custom_word_freq,
                                   custom_confusion_dict=self.custom_confusion)
        # pretrained BERT model
        t6 = time.time()
        self.bert_tokenizer = BertTokenizer(vocab_file=self.bert_model_vocab)
        self.MASK_TOKEN = "[MASK]"
        self.MASK_ID = self.bert_tokenizer.convert_tokens_to_ids(
            [self.MASK_TOKEN])[0]
        # Prepare model
        self.model = BertForMaskedLM.from_pretrained(self.bert_model_dir)
        logger.debug("Loaded model ok, path: %s, spend: %.3f s." %
                     (self.bert_model_dir, time.time() - t6))
        self.initialized_detector = True
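
The detector above caches MASK_ID so that masked-LM scoring can be run per position. A minimal sketch of that scoring step, assuming the pytorch_pretrained_bert-style API used throughout these examples (paths are placeholders):

import torch
from pytorch_pretrained_bert import BertTokenizer, BertForMaskedLM

tokenizer = BertTokenizer(vocab_file='bert/vocab.txt')   # placeholder path
model = BertForMaskedLM.from_pretrained('bert/')         # placeholder dir
model.eval()

tokens = ['[CLS]', 'the', 'cat', '[MASK]', 'on', 'the', 'mat', '[SEP]']
input_ids = torch.tensor([tokenizer.convert_tokens_to_ids(tokens)])
with torch.no_grad():
    scores = model(input_ids)               # (1, seq_len, vocab_size)
best_id = scores[0, 3].argmax().item()      # index 3 is the [MASK] slot
print(tokenizer.convert_ids_to_tokens([best_id])[0])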
Example #2

def main(args):

    if args.dataset == 'sim-R':
        from BERTDST_utils.simR_data_utils import prepare_dataset, MultiWozDataset, make_turn_label, postprocessing, state_equal, SLOT, OP
    if args.dataset == 'sim-M':
        from BERTDST_utils.simM_data_utils import prepare_dataset, MultiWozDataset, make_turn_label, postprocessing, state_equal, SLOT, OP
    if args.dataset == 'DSTC2':
        from BERTDST_utils.DSTC2_data_utils import prepare_dataset, MultiWozDataset, make_turn_label, postprocessing, state_equal, SLOT, OP
    if args.dataset == 'WOZ2.0':
        from BERTDST_utils.WOZ_data_utils import prepare_dataset, MultiWozDataset, make_turn_label, postprocessing, state_equal, SLOT, OP
    if args.dataset == 'MultiWOZ2.1':
        from BERTDST_utils.MultiWOZ_data_utils import prepare_dataset, MultiWozDataset, make_turn_label, postprocessing, state_equal, OP, make_slot_meta
        ontology = json.load(open(args.ontology_data_path))
        SLOT, ontology = make_slot_meta(ontology)

    slot_meta = SLOT
    tokenizer = BertTokenizer(args.vocab_path, do_lower_case=True)
    data = prepare_dataset(1.0, args.test_data_path, tokenizer, slot_meta,
                           args.test_size_window, args.max_seq_length,
                           args.test_MG)

    model_config = BertConfig.from_json_file(args.bert_config_path)
    model_config.dropout = 0.1
    op2id = OP
    model = MGDST(model_config, len(op2id), len(slot_meta))
    ckpt = torch.load(args.model_ckpt_path, map_location='cpu')
    model.load_state_dict(ckpt)

    model.eval()
    model.to(device)

    model_evaluation(make_turn_label, postprocessing, state_equal, OP, model,
                     data, tokenizer, slot_meta, 0, args.test_size_window,
                     args.test_MG)
Example #3
    def run(self):
        remote_helper.get_remote_date(
            "https://www.flyai.com/m/chinese_base.zip")
        before_vocab_dir = os.path.join(os.getcwd(), 'vocab.txt')
        after_vocab_dir = os.path.join(args.bert_model_dir, 'vocab.txt')
        logger.info('>before_vocab_dir:{}'.format(before_vocab_dir))
        logger.info('>after_vocab_dir:{}'.format(after_vocab_dir))

        shutil.copyfile(before_vocab_dir, after_vocab_dir)

        if not os.path.exists(self.arguments.output_dir):
            os.mkdir(self.arguments.output_dir)

        self.arguments.BATCH = self.arguments.BATCH // self.arguments.gradient_accumulation_steps

        # data preparation: choose the tokenizer
        tokenizer = BertTokenizer.from_pretrained(
            self.arguments.bert_model_dir,
            do_lower_case=self.arguments.do_lower_case)
        # fetch the data: news/keywords
        train_news, train_category, dev_news, dev_category = self.generate()

        self.train(Net=Net,
                   train_category=train_category,
                   dev_category=dev_category,
                   train_news=train_news,
                   dev_news=dev_news,
                   tokenizer=tokenizer)
Example #4

 def load_tokenizer(self):
     if self.model_configuration.is_xlnet:
         self.tokenizer = XLNetTokenizer.from_pretrained(self.model_configuration.bert_model,
                                                         do_lower_case=self.model_configuration.do_lower)
     elif not self.model_configuration.is_scibert:
         self.tokenizer = BertTokenizer.from_pretrained(self.model_configuration.bert_model,
                                                        do_lower_case=self.model_configuration.do_lower)
     else:
         self.tokenizer = BertTokenizer(self.model_configuration.vocab_file,
                                        do_lower_case=self.model_configuration.do_lower)
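
load_tokenizer above picks between from_pretrained and the raw vocab-file constructor. A short sketch of the difference, assuming the same pytorch_pretrained_bert-style API (file names are placeholders):

from pytorch_pretrained_bert import BertTokenizer

# By model name or local directory: resolves/downloads the vocab itself.
tok_a = BertTokenizer.from_pretrained('bert-base-uncased', do_lower_case=True)

# From an explicit vocab file, e.g. for SciBERT or another custom vocabulary.
tok_b = BertTokenizer('scibert_vocab.txt', do_lower_case=True)  # placeholder file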
Example #5
 def initialize_bert_detector(self):
     t1 = time.time()
     self.bert_tokenizer = BertTokenizer(vocab_file=self.bert_model_vocab)
     self.MASK_TOKEN = "[MASK]"
     self.MASK_ID = self.bert_tokenizer.convert_tokens_to_ids(
         [self.MASK_TOKEN])[0]
     # Prepare model
     self.model = BertForMaskedLM.from_pretrained(self.bert_model_dir)
     logger.debug("Loaded model ok, path: %s, spend: %.3f s." %
                  (self.bert_model_dir, time.time() - t1))
     self.initialized_bert_detector = True
Example #6
 def fit(self, tokens):
     # NOTE: We allow the model to use default: do_basic_tokenize.
     # This potentially splits tokens into more tokens apart from subtokens:
     # eg. Mr.Doe -> Mr . D ##oe  (Note that . is not preceded by ##)
     # We take this into account when creating the token_flags in
     # function text_to_token_flags
     self.tokenizer = BertTokenizer(
         self.bert_filename,
         # do_basic_tokenize=self.do_basic_tokenize,
         do_lower_case=self.do_lower_case)
     return self
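
A small sketch of the splitting behavior that the comment in fit describes; the exact wordpieces depend on the vocabulary, so the output shown is only indicative:

tokenizer = BertTokenizer('vocab.txt', do_lower_case=False)  # placeholder vocab
print(tokenizer.tokenize('Mr.Doe'))
# Basic tokenization first splits on punctuation, so the result looks like
# ['Mr', '.', 'D', '##oe'] -- '.' becomes its own token, and only the
# in-word continuation pieces carry the '##' prefix.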
Example #7
def init_params():
    processors = {"sentiment_analysis": SentiAnalysisProcessor}
    task_name = args.task_name.lower()
    if task_name not in processors:
        raise ValueError("Task not found: %s" % (task_name))
    processor = processors[task_name]()
    if args.model_type == 'bert':
        tokenizer = BertTokenizer(vocab_file=args.VOCAB_FILE)
    elif args.model_type == 'xlnet':
        tokenizer = XLNetTokenizer.from_pretrained(
            os.path.join(args.ROOT_DIR, args.xlnet_model),
            do_lower_case=args.do_lower_case)
    return processor, tokenizer
Example #8
 def __init__(self,
              pretrained_model=None,
              vocab_file=None,
              do_lower_case=True,
              max_len=None,
              do_basic_tokenize=True,
              never_split=("[UNK]", "[SEP]", "[PAD]", "[CLS]", "[MASK]")):
     if pretrained_model:
         self.tokenizer = BertTokenizer.from_pretrained(pretrained_model)
         if "uncased" not in pretrained_model:
             self.tokenizer.basic_tokenizer.do_lower_case = False
     else:
         self.tokenizer = BertTokenizer(vocab_file, do_lower_case,
                                        do_basic_tokenize)
     self.vocab_size = len(self.tokenizer.vocab)
     self.never_split = never_split
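
A hedged usage sketch of the constructor above; the enclosing class is not shown in the snippet, so TokenizerWrapper here is a hypothetical stand-in:

wrapper = TokenizerWrapper(pretrained_model='bert-base-cased')
print(wrapper.vocab_size)    # size of the loaded WordPiece vocabulary
print(wrapper.never_split)   # special tokens protected from splitting

# With a cased checkpoint, "uncased" is absent from the model name, so the
# wrapper forces do_lower_case=False on the underlying basic tokenizer.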
Example #9
    def __init__(self, args):
        try:
            from pytorch_transformers import BertTokenizer
            from pytorch_transformers.tokenization_utils import clean_up_tokenization
        except ImportError:
            raise ImportError(
                'Please install pytorch_transformers 1.0.0 '
                'with: pip install pytorch-transformers')

        if 'bpe_vocab_file' in args:
            self.bert_tokenizer = BertTokenizer(
                args.bpe_vocab_file, do_lower_case=not args.bpe_cased)
        else:
            vocab_file_name = 'bert-base-cased' if args.bpe_cased else 'bert-base-uncased'
            self.bert_tokenizer = BertTokenizer.from_pretrained(
                vocab_file_name)
        # Set unconditionally; the original indentation only set this in the
        # else branch, leaving the attribute missing when bpe_vocab_file is given.
        self.clean_up_tokenization = clean_up_tokenization
Example #10
def main(args):
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

    ontology = json.load(open(os.path.join(args.data_root,
                                           args.ontology_data)))
    slot_meta, _ = make_slot_meta(ontology)
    tokenizer = BertTokenizer(args.vocab_path, do_lower_case=True)
    data = prepare_dataset(os.path.join(args.data_root,
                                        args.test_data), tokenizer, slot_meta,
                           args.n_history, args.max_seq_length, args.op_code)

    model_config = BertConfig.from_json_file(args.bert_config_path)
    model_config.dropout = 0.1
    op2id = OP_SET[args.op_code]
    model = TransformerDST(model_config, len(op2id), len(domain2id),
                           op2id['update'])
    ckpt = torch.load(args.model_ckpt_path, map_location='cpu')
    model.load_state_dict(ckpt)

    model.eval()
    model.to(device)

    if args.eval_all:
        model_evaluation(model, data, tokenizer, slot_meta, 0, args.op_code,
                         False, False, False)
        model_evaluation(model, data, tokenizer, slot_meta, 0, args.op_code,
                         False, False, True)
        model_evaluation(model, data, tokenizer, slot_meta, 0, args.op_code,
                         False, True, False)
        model_evaluation(model, data, tokenizer, slot_meta, 0, args.op_code,
                         False, True, True)
        model_evaluation(model, data, tokenizer, slot_meta, 0, args.op_code,
                         True, False, False)
        model_evaluation(model, data, tokenizer, slot_meta, 0, args.op_code,
                         True, True, False)
        model_evaluation(model, data, tokenizer, slot_meta, 0, args.op_code,
                         True, False, True)
        model_evaluation(model, data, tokenizer, slot_meta, 0, args.op_code,
                         True, True, True)
    else:
        model_evaluation(model, data, tokenizer, slot_meta, 0, args.op_code,
                         args.gt_op, args.gt_p_state, args.gt_gen)
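
The eight calls under --eval_all walk through every (gt_op, gt_p_state, gt_gen) truth-value combination. An equivalent, more compact sketch (the enumeration order differs slightly from the hand-written sequence):

from itertools import product

for gt_op, gt_p_state, gt_gen in product((False, True), repeat=3):
    model_evaluation(model, data, tokenizer, slot_meta, 0, args.op_code,
                     gt_op, gt_p_state, gt_gen)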
Example #11
    def __init__(self, model_directory, vocab_file, lower=False):

        # Load pre-trained model (weights)

        self.model = BertForMaskedLM.from_pretrained(model_directory)
        self.model.eval()
        self.cuda = torch.cuda.is_available()
        if self.cuda:
            self.model = self.model.cuda()

        # Load pre-trained model tokenizer (vocabulary)
        self.tokenizer = BertTokenizer(vocab_file=vocab_file,
                                       do_lower_case=lower)

        self.CLS = '[CLS]'
        self.SEP = '[SEP]'
        self.MASK = '[MASK]'
        self.mask_id = self.tokenizer.convert_tokens_to_ids([self.MASK])[0]
        self.sep_id = self.tokenizer.convert_tokens_to_ids([self.SEP])[0]
        self.cls_id = self.tokenizer.convert_tokens_to_ids([self.CLS])[0]
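
The ids cached above make per-position masking cheap. A hedged scoring sketch; BertScorer stands in for the unshown class name, and the paths are placeholders:

import torch

scorer = BertScorer('model_dir/', 'vocab.txt')   # hypothetical class and paths
tokens = [scorer.CLS, 'he', 'is', scorer.MASK, scorer.SEP]
input_ids = torch.tensor([scorer.tokenizer.convert_tokens_to_ids(tokens)])
if scorer.cuda:
    input_ids = input_ids.cuda()
with torch.no_grad():
    scores = scorer.model(input_ids)
print(scores[0, 3].topk(5))   # top-5 candidate fillers for the masked slot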
Example #12

    def init(self):
        bert_config = BertConfig(self.args.output_config_file)
        if os.path.exists(self.args.output_model_file):
            if self.args.model_name == 'BertCNNPlus':
                bert_config.filter_num = self.args.filter_num
                bert_config.filter_sizes = [int(val) for val in self.args.filter_sizes.split()]
            elif self.args.model_name == 'BertRCNN':
                bert_config.rnn_hidden_size = self.args.rnn_hidden_size
                bert_config.num_layers = self.args.num_layers
                bert_config.bidirectional = self.args.bidirectional
                bert_config.dropout = self.args.dropout
            else:
                pass

            self.model = Net(config=bert_config)
            self.model.load_state_dict(torch.load(self.args.output_model_file))
            self.model.to(DEVICE)

        self.tokenizer = BertTokenizer.from_pretrained(self.args.bert_model_dir,
                                                       do_lower_case=self.args.do_lower_case)
Example #13
 def __init__(self):
     self.use_gpu = t.cuda.is_available()
     self.vocab_root = "../kernel/vocab.txt"
     self.bert_config_root = "../kernel/bert_config.json"
     self.pretrained_bert_root = "../kernel/chr_idiombert.bin"
     self.raw_test_data_root = "../data/test.txt"
     self.test_ans_root = "../kernel/dev_ans.csv"
     self.idiom_vocab_root = "../kernel/idiomList.txt"
     self.prob_file = "../kernel/prob.csv"
     self.data_root = "../kernel/"
     self.split_test_data_root = "../kernel/split_test_data.json"
     self.tokenizer = BertTokenizer(vocab_file=self.vocab_root)
     self.num_workers = 4
     self.test_batch_size = 512
     self.max_seq_length = 128
     with open(self.data_root + "idiom2index", mode="rb") as f1:
         self.idiom2index = pickle.load(f1)
     with open(self.data_root + "index2idiom", mode="rb") as f2:
         self.index2idiom = pickle.load(f2)
     self.hidden_dropout_prob = 0.5
      self.device = t.device("cuda" if t.cuda.is_available() else "cpu")
Example #14
def main(args):
    def worker_init_fn(worker_id):
        np.random.seed(args.random_seed + worker_id)

    if args.dataset == 'sim-R':
        from BERTDST_utils.simR_data_utils import prepare_dataset, MultiWozDataset, make_turn_label, postprocessing, state_equal, SLOT, OP
    if args.dataset == 'sim-M':
        from BERTDST_utils.simM_data_utils import prepare_dataset, MultiWozDataset, make_turn_label, postprocessing, state_equal, SLOT, OP
    if args.dataset == 'DSTC2':
        from BERTDST_utils.DSTC2_data_utils import prepare_dataset, MultiWozDataset, make_turn_label, postprocessing, state_equal, SLOT, OP
    if args.dataset == 'WOZ2.0':
        from BERTDST_utils.WOZ_data_utils import prepare_dataset, MultiWozDataset, make_turn_label, postprocessing, state_equal, SLOT, OP
    if args.dataset == 'MultiWOZ2.1':
        from BERTDST_utils.MultiWOZ_data_utils import prepare_dataset, MultiWozDataset, make_turn_label, postprocessing, state_equal, OP, make_slot_meta
        ontology = json.load(open(args.ontology_data_path))
        SLOT, ontology = make_slot_meta(ontology)

    n_gpu = 0
    if torch.cuda.is_available():
        n_gpu = torch.cuda.device_count()

    np.random.seed(args.random_seed)
    random.seed(args.random_seed)
    rng = random.Random(args.random_seed)
    torch.manual_seed(args.random_seed)
    if n_gpu > 0:
        torch.cuda.manual_seed(args.random_seed)
        torch.cuda.manual_seed_all(args.random_seed)
        torch.backends.cudnn.benchmark = False
        torch.backends.cudnn.deterministic = True

    if not os.path.exists(args.save_dir):
        os.mkdir(args.save_dir)

    slot_meta = SLOT
    op2id = OP
    print(op2id)
    tokenizer = BertTokenizer(args.vocab_path, do_lower_case=True)

    train_data_raw = prepare_dataset(data_scale=args.train_scale,
                                     data_path=args.train_data_path,
                                     tokenizer=tokenizer,
                                     slot_meta=slot_meta,
                                     size_window=args.train_size_window,
                                     max_seq_length=args.max_seq_length,
                                     multi_granularity=args.train_MG,
                                     data_type='train')

    train_data = MultiWozDataset(train_data_raw, tokenizer, slot_meta,
                                 args.max_seq_length, rng, args.word_dropout)
    print("# train examples %d" % len(train_data_raw))

    dev_data_raw = prepare_dataset(data_scale=1.0,
                                   data_path=args.dev_data_path,
                                   tokenizer=tokenizer,
                                   slot_meta=slot_meta,
                                   size_window=args.test_size_window,
                                   max_seq_length=args.max_seq_length,
                                   multi_granularity=args.test_MG,
                                   data_type='dev')
    print("# dev examples %d" % len(dev_data_raw))

    test_data_raw = prepare_dataset(data_scale=1.0,
                                    data_path=args.test_data_path,
                                    tokenizer=tokenizer,
                                    slot_meta=slot_meta,
                                    size_window=args.test_size_window,
                                    max_seq_length=args.max_seq_length,
                                    multi_granularity=args.test_MG,
                                    data_type='test')
    print("# test examples %d" % len(test_data_raw))

    model_config = BertConfig.from_json_file(args.bert_config_path)
    model_config.dropout = args.dropout
    model_config.attention_probs_dropout_prob = args.attention_probs_dropout_prob
    model_config.hidden_dropout_prob = args.hidden_dropout_prob
    model = MGDST(model_config, len(op2id), len(slot_meta))

    ckpt = torch.load(args.bert_ckpt_path, map_location='cpu')
    ckpt1 = {
        k.replace('bert.', '').replace('gamma',
                                       'weight').replace('beta', 'bias'): v
        for k, v in ckpt.items() if 'cls.' not in k
    }
    model.encoder.bert.load_state_dict(ckpt1)
    #model.encoder.bert.from_pretrained(args.bert_ckpt_path)

    model.to(device)

    num_train_steps = int(
        len(train_data_raw) / args.batch_size * args.n_epochs)

    no_decay = ['bias', 'LayerNorm.bias', 'LayerNorm.weight']
    enc_param_optimizer = list(model.encoder.named_parameters())
    enc_optimizer_grouped_parameters = [{
        'params': [
            p for n, p in enc_param_optimizer
            if not any(nd in n for nd in no_decay)
        ],
        'weight_decay':
        0.01
    }, {
        'params':
        [p for n, p in enc_param_optimizer if any(nd in n for nd in no_decay)],
        'weight_decay':
        0.0
    }]

    enc_optimizer = AdamW(enc_optimizer_grouped_parameters, lr=args.enc_lr)
    enc_scheduler = WarmupLinearSchedule(enc_optimizer,
                                         int(num_train_steps *
                                             args.enc_warmup),
                                         t_total=num_train_steps)

    dec_param_optimizer = list(model.decoder.parameters())
    dec_optimizer = AdamW(dec_param_optimizer, lr=args.dec_lr)
    dec_scheduler = WarmupLinearSchedule(dec_optimizer,
                                         int(num_train_steps *
                                             args.dec_warmup),
                                         t_total=num_train_steps)

    if n_gpu > 1:
        model = torch.nn.DataParallel(model)

    train_sampler = RandomSampler(train_data)
    train_dataloader = DataLoader(train_data,
                                  sampler=train_sampler,
                                  batch_size=args.batch_size,
                                  collate_fn=train_data.collate_fn,
                                  num_workers=args.num_workers,
                                  worker_init_fn=worker_init_fn)

    loss_fnc = nn.CrossEntropyLoss()
    best_score = {'epoch': 0, 'joint_acc': 0, 'op_acc': 0, 'final_slot_f1': 0}
    total_step = 0
    for epoch in range(args.n_epochs):
        batch_loss = []
        model.train()
        for step, batch in enumerate(train_dataloader):
            batch = [
                b.to(device) if not isinstance(b, int) else b for b in batch
            ]
            input_ids, input_mask, segment_ids, op_ids, gen_ids = batch

            state_scores, span_scores = model(input_ids=input_ids,
                                              token_type_ids=segment_ids,
                                              attention_mask=input_mask)

            loss_state = loss_fnc(
                state_scores.contiguous().view(-1, len(op2id)),
                op_ids.contiguous().view(-1))
            # Compute directly: the original try/except printed the error and
            # continued, which left loss_span undefined on the next line.
            loss_span = masked_cross_entropy_for_value(
                span_scores.contiguous(), gen_ids.contiguous(),
                tokenizer.vocab['[PAD]'])
            loss = loss_state * 0.8 + loss_span * 0.2
            batch_loss.append(loss.item())

            loss.backward()
            enc_optimizer.step()
            enc_scheduler.step()
            dec_optimizer.step()
            dec_scheduler.step()
            model.zero_grad()

            total_step += 1

            if step % 100 == 0:
                print("[%d/%d] [%d/%d] mean_loss : %.3f, state_loss : %.3f, span_loss : %.3f" \
                          % (epoch+1, args.n_epochs, step,
                             len(train_dataloader), np.mean(batch_loss),
                             loss_state.item(), loss_span.item()))
                batch_loss = []

        if (epoch + 1) % args.eval_epoch == 0:
            print('total_step: ', total_step)
            eval_res = model_evaluation(make_turn_label, postprocessing,
                                        state_equal, OP, model, dev_data_raw,
                                        tokenizer, slot_meta, epoch + 1,
                                        args.test_size_window, args.test_MG)
            if eval_res['joint_acc'] > best_score['joint_acc']:
                best_score = eval_res
                model_to_save = model.module if hasattr(model,
                                                        'module') else model
                save_path = os.path.join(
                    args.save_dir,
                    'model_best_gran[%s]_scale[%s]_seed[%s].bin' %
                    (str(args.train_size_window), str(
                        args.train_scale), args.random_seed))
                torch.save(model_to_save.state_dict(), save_path)
            print("Best Score : ", best_score)
            print("\n")

            if epoch > args.patience_start_epoch and best_score[
                    'epoch'] + args.patience < epoch:
                print("out of patience...")
                break

    print("Test using best model...")
    best_epoch = best_score['epoch']
    ckpt_path = os.path.join(
        args.save_dir, 'model_best_gran[%s]_scale[%s]_seed[%s].bin' %
        (str(args.train_size_window), str(args.train_scale), args.random_seed))
    model = MGDST(model_config, len(op2id), len(slot_meta))
    ckpt = torch.load(ckpt_path, map_location='cpu')
    model.load_state_dict(ckpt)
    model.to(device)

    model_evaluation(make_turn_label, postprocessing, state_equal, OP, model,
                     test_data_raw, tokenizer, slot_meta, best_epoch,
                     args.test_size_window, args.test_MG)
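
The enc_optimizer_grouped_parameters block above follows the standard BERT fine-tuning convention of exempting biases and LayerNorm weights from weight decay. The same pattern in compact form (the learning rate is a placeholder):

no_decay = ['bias', 'LayerNorm.bias', 'LayerNorm.weight']
grouped_parameters = [
    {'params': [p for n, p in model.encoder.named_parameters()
                if not any(nd in n for nd in no_decay)],
     'weight_decay': 0.01},
    {'params': [p for n, p in model.encoder.named_parameters()
                if any(nd in n for nd in no_decay)],
     'weight_decay': 0.0},
]
optimizer = AdamW(grouped_parameters, lr=4e-5)  # placeholder learning rate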
Example #15
 def load(self, filename):
     self.tokenizer = BertTokenizer(
         filename,
         # do_basic_tokenize=self.do_basic_tokenize,
         do_lower_case=self.do_lower_case)
     return self
Example #16
 def __init__(self, vocab_path, do_lower_case, min_freq_words=None):
     self.tokenizer = BertTokenizer(vocab_file=vocab_path,
                                    do_lower_case=do_lower_case)
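
A hedged usage sketch of the thin wrapper above; WrappedTokenizer stands in for the unshown class name, and the vocab path is a placeholder:

wrapped = WrappedTokenizer('vocab.txt', do_lower_case=True)
pieces = wrapped.tokenizer.tokenize('unaffable')
ids = wrapped.tokenizer.convert_tokens_to_ids(pieces)
print(pieces, ids)   # e.g. ['un', '##aff', '##able'] with their vocab ids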
Example #17
def main(args):
    def worker_init_fn(worker_id):
        np.random.seed(args.random_seed + worker_id)

    n_gpu = 0
    if torch.cuda.is_available():
        n_gpu = torch.cuda.device_count()
    np.random.seed(args.random_seed)
    random.seed(args.random_seed)
    rng = random.Random(args.random_seed)
    torch.manual_seed(args.random_seed)
    if n_gpu > 0:
        torch.cuda.manual_seed(args.random_seed)
        torch.cuda.manual_seed_all(args.random_seed)
        torch.backends.cudnn.benchmark = False
        torch.backends.cudnn.deterministic = True

    if not os.path.exists(args.save_dir):
        os.mkdir(args.save_dir)

    ontology = json.load(open(args.ontology_data))
    slot_meta, ontology = make_slot_meta(ontology)
    op2id = OP_SET[args.op_code]
    print(op2id)
    tokenizer = BertTokenizer(args.vocab_path, do_lower_case=True)

    train_data_raw = prepare_dataset(data_path=args.train_data_path,
                                     tokenizer=tokenizer,
                                     slot_meta=slot_meta,
                                     n_history=args.n_history,
                                     max_seq_length=args.max_seq_length,
                                     op_code=args.op_code)

    train_data = MultiWozDataset(train_data_raw,
                                 tokenizer,
                                 slot_meta,
                                 args.max_seq_length,
                                 rng,
                                 ontology,
                                 args.word_dropout,
                                 args.shuffle_state,
                                 args.shuffle_p)
    print("# train examples %d" % len(train_data_raw))

    dev_data_raw = prepare_dataset(data_path=args.dev_data_path,
                                   tokenizer=tokenizer,
                                   slot_meta=slot_meta,
                                   n_history=args.n_history,
                                   max_seq_length=args.max_seq_length,
                                   op_code=args.op_code)
    print("# dev examples %d" % len(dev_data_raw))

    test_data_raw = prepare_dataset(data_path=args.test_data_path,
                                    tokenizer=tokenizer,
                                    slot_meta=slot_meta,
                                    n_history=args.n_history,
                                    max_seq_length=args.max_seq_length,
                                    op_code=args.op_code)
    print("# test examples %d" % len(test_data_raw))

    model_config = BertConfig.from_json_file(args.bert_config_path)
    model_config.dropout = args.dropout
    model_config.attention_probs_dropout_prob = args.attention_probs_dropout_prob
    model_config.hidden_dropout_prob = args.hidden_dropout_prob

    model = SomDST(model_config, len(op2id), len(domain2id), op2id['update'], args.exclude_domain)

    if not os.path.exists(args.bert_ckpt_path):
        args.bert_ckpt_path = download_ckpt(args.bert_ckpt_path, args.bert_config_path, 'assets')

    ckpt = torch.load(args.bert_ckpt_path, map_location='cpu')
    model.encoder.bert.load_state_dict(ckpt)

    # re-initialize added special tokens ([SLOT], [NULL], [EOS])
    model.encoder.bert.embeddings.word_embeddings.weight.data[1].normal_(mean=0.0, std=0.02)
    model.encoder.bert.embeddings.word_embeddings.weight.data[2].normal_(mean=0.0, std=0.02)
    model.encoder.bert.embeddings.word_embeddings.weight.data[3].normal_(mean=0.0, std=0.02)
    model.to(device)

    num_train_steps = int(len(train_data_raw) / args.batch_size * args.n_epochs)

    no_decay = ['bias', 'LayerNorm.bias', 'LayerNorm.weight']
    enc_param_optimizer = list(model.encoder.named_parameters())
    enc_optimizer_grouped_parameters = [
        {'params': [p for n, p in enc_param_optimizer if not any(nd in n for nd in no_decay)], 'weight_decay': 0.01},
        {'params': [p for n, p in enc_param_optimizer if any(nd in n for nd in no_decay)], 'weight_decay': 0.0}
        ]

    enc_optimizer = AdamW(enc_optimizer_grouped_parameters, lr=args.enc_lr)
    enc_scheduler = WarmupLinearSchedule(enc_optimizer, int(num_train_steps * args.enc_warmup),
                                         t_total=num_train_steps)

    dec_param_optimizer = list(model.decoder.parameters())
    dec_optimizer = AdamW(dec_param_optimizer, lr=args.dec_lr)
    dec_scheduler = WarmupLinearSchedule(dec_optimizer, int(num_train_steps * args.dec_warmup),
                                         t_total=num_train_steps)

    if n_gpu > 1:
        model = torch.nn.DataParallel(model)

    train_sampler = RandomSampler(train_data)
    train_dataloader = DataLoader(train_data,
                                  sampler=train_sampler,
                                  batch_size=args.batch_size,
                                  collate_fn=train_data.collate_fn,
                                  num_workers=args.num_workers,
                                  worker_init_fn=worker_init_fn)

    loss_fnc = nn.CrossEntropyLoss()
    best_score = {'epoch': 0, 'joint_acc': 0, 'op_acc': 0, 'final_slot_f1': 0}
    for epoch in range(args.n_epochs):
        batch_loss = []
        model.train()
        for step, batch in enumerate(train_dataloader):
            batch = [b.to(device) if not isinstance(b, int) else b for b in batch]
            input_ids, input_mask, segment_ids, state_position_ids, op_ids,\
            domain_ids, gen_ids, max_value, max_update = batch

            if rng.random() < args.decoder_teacher_forcing:  # teacher forcing
                teacher = gen_ids
            else:
                teacher = None

            domain_scores, state_scores, gen_scores = model(input_ids=input_ids,
                                                            token_type_ids=segment_ids,
                                                            state_positions=state_position_ids,
                                                            attention_mask=input_mask,
                                                            max_value=max_value,
                                                            op_ids=op_ids,
                                                            max_update=max_update,
                                                            teacher=teacher)

            loss_s = loss_fnc(state_scores.view(-1, len(op2id)), op_ids.view(-1))
            loss_g = masked_cross_entropy_for_value(gen_scores.contiguous(),
                                                    gen_ids.contiguous(),
                                                    tokenizer.vocab['[PAD]'])
            loss = loss_s + loss_g
            if not args.exclude_domain:
                loss_d = loss_fnc(domain_scores.view(-1, len(domain2id)), domain_ids.view(-1))
                loss = loss + loss_d
            batch_loss.append(loss.item())

            loss.backward()
            enc_optimizer.step()
            enc_scheduler.step()
            dec_optimizer.step()
            dec_scheduler.step()
            model.zero_grad()

            if step % 100 == 0:
                if not args.exclude_domain:
                    print("[%d/%d] [%d/%d] mean_loss : %.3f, state_loss : %.3f, gen_loss : %.3f, dom_loss : %.3f" \
                          % (epoch+1, args.n_epochs, step,
                             len(train_dataloader), np.mean(batch_loss),
                             loss_s.item(), loss_g.item(), loss_d.item()))
                else:
                    print("[%d/%d] [%d/%d] mean_loss : %.3f, state_loss : %.3f, gen_loss : %.3f" \
                          % (epoch+1, args.n_epochs, step,
                             len(train_dataloader), np.mean(batch_loss),
                             loss_s.item(), loss_g.item()))
                batch_loss = []

        if (epoch+1) % args.eval_epoch == 0:
            eval_res = model_evaluation(model, dev_data_raw, tokenizer, slot_meta, epoch+1, args.op_code)
            if eval_res['joint_acc'] > best_score['joint_acc']:
                best_score = eval_res
                model_to_save = model.module if hasattr(model, 'module') else model
                save_path = os.path.join(args.save_dir, 'model_best.bin')
                torch.save(model_to_save.state_dict(), save_path)
            print("Best Score : ", best_score)
            print("\n")

    print("Test using best model...")
    best_epoch = best_score['epoch']
    ckpt_path = os.path.join(args.save_dir, 'model_best.bin')
    model = SomDST(model_config, len(op2id), len(domain2id), op2id['update'], args.exclude_domain)
    ckpt = torch.load(ckpt_path, map_location='cpu')
    model.load_state_dict(ckpt)
    model.to(device)

    model_evaluation(model, test_data_raw, tokenizer, slot_meta, best_epoch, args.op_code,
                     is_gt_op=False, is_gt_p_state=False, is_gt_gen=False)
    model_evaluation(model, test_data_raw, tokenizer, slot_meta, best_epoch, args.op_code,
                     is_gt_op=False, is_gt_p_state=False, is_gt_gen=True)
    model_evaluation(model, test_data_raw, tokenizer, slot_meta, best_epoch, args.op_code,
                     is_gt_op=False, is_gt_p_state=True, is_gt_gen=False)
    model_evaluation(model, test_data_raw, tokenizer, slot_meta, best_epoch, args.op_code,
                     is_gt_op=False, is_gt_p_state=True, is_gt_gen=True)
    model_evaluation(model, test_data_raw, tokenizer, slot_meta, best_epoch, args.op_code,
                     is_gt_op=True, is_gt_p_state=False, is_gt_gen=False)
    model_evaluation(model, test_data_raw, tokenizer, slot_meta, best_epoch, args.op_code,
                     is_gt_op=True, is_gt_p_state=True, is_gt_gen=False)
    model_evaluation(model, test_data_raw, tokenizer, slot_meta, best_epoch, args.op_code,
                     is_gt_op=True, is_gt_p_state=False, is_gt_gen=True)
    model_evaluation(model, test_data_raw, tokenizer, slot_meta, best_epoch, args.op_code,
                     is_gt_op=True, is_gt_p_state=True, is_gt_gen=True)
Example #18
 def __init__(self, vocab_path, do_lower_case):
     self.tokenizer = BertTokenizer(vocab_path, do_lower_case)
Example #19
def get_tokenizer(vocab_file=None):
    vocab_file = VOCAB_FILE if vocab_file is None else vocab_file
    assert os.path.isfile(vocab_file)
    tokenizer = BertTokenizer(vocab_file=vocab_file, do_lower_case=False)
    return tokenizer
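
A short usage sketch for get_tokenizer; VOCAB_FILE is the module-level default referenced above:

tokenizer = get_tokenizer()                  # falls back to VOCAB_FILE
print(len(tokenizer.vocab))                  # vocabulary size
print(tokenizer.tokenize('Hello, World!'))   # cased WordPiece split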
Example #20

def main(log_in_file, lm_path, lm_type, data_path, usegpu, n_fold, total_step,
         eval_every, early_stop, lr, weight_decay, lr_decay_in_layers,
         wd_decay_in_layers, max_length, max_title_rate, content_head_rate,
         batch_size, lr_scheduler_type, input_pattern, clean_method,
         warmup_rate, classifier_dropout, classifier_active, seed):
    arg_name_value_pairs = deepcopy(locals())
    prefix = time.strftime('%Y%m%d_%H%M')
    logger = logging.getLogger('default')
    formatter = logging.Formatter("%(asctime)s %(message)s")
    if log_in_file:
        handler1 = logging.FileHandler(prefix + '.log')
        handler1.setFormatter(formatter)
        handler1.setLevel(logging.DEBUG)
        logger.addHandler(handler1)
    handler2 = logging.StreamHandler()
    handler2.setFormatter(formatter)
    handler2.setLevel(logging.DEBUG)
    logger.addHandler(handler2)
    logger.setLevel(logging.DEBUG)
    for arg_name, arg_value in arg_name_value_pairs.items():
        logger.info(f'{arg_name}: {arg_value}')
    global tokenizer
    if lm_type == 'bert':
        tokenizer = BertTokenizer(os.path.join(lm_path, 'vocab.txt'))
    else:
        tokenizer = XLNetTokenizer(os.path.join(lm_path, 'spiece.model'))
        global PAD, PAD_t, CLS_t, SEP_t
        PAD_t = '<pad>'
        CLS_t = '<cls>'
        SEP_t = '<sep>'
        PAD = tokenizer.convert_tokens_to_ids([PAD_t])[0]
    logger.info(f'padding token is {PAD}')
    processed_train = preprocess(
        os.path.join(data_path, 'Train_DataSet.csv'),
        os.path.join(data_path,
                     'Train_DataSet_Label.csv'), tokenizer, max_length,
        input_pattern, clean_method, max_title_rate, content_head_rate, logger)
    processed_test = preprocess(os.path.join(data_path, 'Test_DataSet.csv'),
                                False, tokenizer, max_length, input_pattern,
                                clean_method, max_title_rate,
                                content_head_rate, logger)
    logger.info('seed everything and create model')
    seed_everything(seed)
    no_decay = ['.bias', 'LayerNorm.bias', 'LayerNorm.weight']
    if lm_type == 'xlnet':
        model = XLNetForSequenceClassification.from_pretrained(
            lm_path, num_labels=3, summary_last_dropout=classifier_dropout)
        if classifier_active == 'relu':
            model.sequence_summary.activation = nn.ReLU()
        if usegpu:
            model = model.cuda()
        model_layer_names = [
            'transformer.mask_emb', 'transformer.word_embedding.weight'
        ]
        model_layer_names += [
            f'transformer.layer.{i}.' for i in range(model.config.n_layer)
        ]
        model_layer_names += ['sequence_summary.summary', 'logits_proj']
    else:
        model = BertForSequenceClassification.from_pretrained(
            lm_path, num_labels=3, hidden_dropout_prob=classifier_dropout)
        if classifier_active == 'relu':
            model.bert.pooler.activation = nn.ReLU()
        if usegpu:
            model = model.cuda()
        model_layer_names = ['bert.embeddings']
        model_layer_names += [
            'bert.encoder.layer.{}.'.format(i)
            for i in range(model.config.num_hidden_layers)
        ]
        model_layer_names += ['bert.pooler', 'classifier']
    optimizer = AdamW([{
        'params': [
            p for n, p in model.named_parameters()
            if layer_name in n and not any(nd in n for nd in no_decay)
        ],
        'lr':
        lr * (lr_decay_in_layers**i),
        'weight_decay':
        weight_decay * (wd_decay_in_layers**i)
    } for i, layer_name in enumerate(model_layer_names[::-1])] + [{
        'params': [
            p for n, p in model.named_parameters()
            if layer_name in n and any(nd in n for nd in no_decay)
        ],
        'lr':
        lr * (lr_decay_in_layers**i),
        'weight_decay':
        .0
    } for i, layer_name in enumerate(model_layer_names[::-1])])
    if lr_scheduler_type == 'linear':
        lr_scheduler = WarmupLinearSchedule(optimizer,
                                            warmup_steps=warmup_rate,
                                            t_total=total_step)
    elif lr_scheduler_type == 'constant':
        lr_scheduler = WarmupConstantSchedule(optimizer,
                                              warmup_steps=warmup_rate)
    else:
        raise ValueError

    model_state_0 = deepcopy(model.state_dict())
    optimizer_state_0 = deepcopy(optimizer.state_dict())

    test_iter = get_data_iter(processed_test,
                              batch_size * 4,
                              collect_test_func,
                              shuffle=False)
    pred = np.zeros((len(processed_test), 3))
    val_scores = []

    for fold_idx, (train_idx, val_idx) in enumerate(
            KFold(n_splits=n_fold, shuffle=True,
                  random_state=seed).split(processed_train)):
        model.load_state_dict(model_state_0)
        optimizer.load_state_dict(optimizer_state_0)
        if lr_scheduler_type == 'linear':
            lr_scheduler = WarmupLinearSchedule(optimizer,
                                                warmup_steps=warmup_rate,
                                                t_total=total_step)
        elif lr_scheduler_type == 'constant':
            lr_scheduler = WarmupConstantSchedule(optimizer,
                                                  warmup_steps=warmup_rate)
        else:
            raise ValueError
        train_iter = get_data_iter([processed_train[i] for i in train_idx],
                                   batch_size, collect_func)
        val_iter = get_data_iter([processed_train[i] for i in val_idx],
                                 batch_size * 4,
                                 collect_func,
                                 shuffle=False)

        best_model, best_score = training(model=model,
                                          optimizer=optimizer,
                                          lr_scheduler=lr_scheduler,
                                          train_iter=train_iter,
                                          val_iter=val_iter,
                                          total_step=total_step,
                                          tokenizer=tokenizer,
                                          usegpu=usegpu,
                                          eval_every=eval_every,
                                          logger=logger,
                                          early_stop=early_stop,
                                          fold_idx=fold_idx)
        model.load_state_dict(best_model)
        val_scores.append(best_score)
        pred += predict(model, test_iter, usegpu)
    logger.info(f'average: {np.mean(val_scores):.6f}')
    pred = pred / n_fold
    prob_df = pd.DataFrame()
    submit = pd.DataFrame()
    submit['id'] = [i['id'] for i in processed_test]
    submit['label'] = pred.argmax(-1)
    prob_df['id'] = [i['id'] for i in processed_test]
    prob_df['0'] = pred[:, 0]
    prob_df['1'] = pred[:, 1]
    prob_df['2'] = pred[:, 2]
    submit.to_csv(f'submit_{prefix}.csv', index=False)
    prob_df.to_csv(f'probability_{prefix}.csv', index=False)
Example #21

def main():
    parser = argparse.ArgumentParser()

    # model
    parser.add_argument('--model', type=str, default='wordrnn')
    parser.add_argument('--dir', type=str, default=None)
    parser.add_argument('--tokenizer',
                        type=str,
                        default='nltk',
                        help='Only effective when model set to wordrnn')
    parser.add_argument('--criterion', type=str, default='full')

    # data
    parser.add_argument('--set', type=str, default='msr')
    parser.add_argument('--partition', type=str, default='va')
    parser.add_argument('--no-move-cached', action='store_true')

    parser.add_argument('--log-dir', type=str, default='train/noname')
    parser.add_argument('--save-pred', action='store_true')

    args = parser.parse_args()

    problem_set = ProblemSet.load(args.set)
    examples = problem_set.get_examples(args.partition)

    logger.info("Evaluating models saved in {} on {}-{}".format(
        args.dir, args.set, args.partition))

    if not os.path.exists(args.log_dir):
        logger.info("Creating directory at {}".format(args.log_dir))
        os.makedirs(args.log_dir)

    args_path = os.path.join(args.log_dir, 'args.json')
    with open(args_path, 'w') as f:
        logger.info("Saving arguments at {}".format(args_path))
        json.dump(vars(args), f, indent=2)

    log_path = os.path.join(args.log_dir, 'log.txt')
    file_handler = logging.FileHandler(log_path, mode='w')
    file_handler.setLevel(logging.INFO)
    logger.addHandler(file_handler)

    model_type = args.model.lower()
    if model_type == 'wordrnn':
        args_path = osp.join(args.dir, 'args.json')
        with open(args_path, 'r') as f:
            arg_dict = json.load(f)

        vocab_path = osp.join(args.dir, 'vocab.txt')
        vocab = load_vocab(vocab_path)
        if args.tokenizer.lower() == 'nltk':
            tokenizer = NLTKTokenizer(vocab, arg_dict['lower'])
        elif args.tokenizer.lower() == 'wordpiece':
            tokenizer = BertTokenizer(vocab_path, arg_dict['lower'])
        model = WordRNN(len(vocab), len(vocab), arg_dict['rnncell'],
                        arg_dict['emsize'], arg_dict['outsize'],
                        arg_dict['nhid'],
                        arg_dict['nlayers'], arg_dict['bidirec'],
                        arg_dict.get('autoenc',
                                     False), arg_dict['decoder_bias'])
        logger.info(model)

        ckpt_paths = glob.glob(osp.join(args.dir, '*.pt'))
        ckpt_paths.sort(key=osp.getmtime)
        for path in ckpt_paths:
            model.load_state_dict(torch.load(path))
            direction = 'autoenc' if model.autoenc else (
                'bidirec' if model.bidirec else 'forward')
            evaluate(examples, model, tokenizer, direction, args.criterion,
                     str(osp.basename(path.split('.')[0])))
            if args.save_pred:
                save_fn = osp.basename(path).replace('.pt', '.csv')
                save_preds(examples, osp.join(args.log_dir, save_fn))
    elif model_type == 'lm1b':
        lm1b_dir = settings['lm1b_dir']

        for e in examples:
            e.context[0] = ' '.join(['<S>', e.context[0]])
            e.context[-1] = ' '.join([e.context[-1], '</S>'])

        vocab = load_vocab(osp.join(lm1b_dir, 'vocab-2016-09-10.txt'))
        special_tokens = ['<S>', '</S>', '<UNK>']
        tokenizer = BaseTokenizer(vocab, False, '<UNK>', special_tokens)
        in_vocab = load_vocab(osp.join(lm1b_dir, args.dir, 'vocab.txt'))

        out_to_in = [in_vocab['<UNK>']] * 800000
        for i, token in tokenizer.ids_to_tokens.items():
            out_to_in[i] = in_vocab.get(token, in_vocab['<UNK>'])

        tf_path = osp.join(lm1b_dir, 'ckpt-*')
        npy_path = osp.join(lm1b_dir, args.dir, 'embeddings.npy')
        model = LM1B.from_tf(tf_path, npy_path, out_to_in, 8)
        logger.info(model)

        evaluate(examples, model, tokenizer, 'forward', args.criterion)
        if args.save_pred:
            save_preds(examples, osp.join(args.log_dir, 'preds.csv'))
    else:
        cache_dir = settings['pretrans_dir']
        bert_dir = osp.join(settings['pretrans_dir'], args.dir)
        model_or_dir = bert_dir if osp.exists(bert_dir) else args.dir

        config_class, model_class, tokenizer_class = MODEL_CLASSES[model_type]
        config = config_class.from_pretrained(model_or_dir,
                                              cache_dir=cache_dir)
        tokenizer = tokenizer_class.from_pretrained(
            model_or_dir,
            cache_dir=cache_dir,
            max_len=config.max_position_embeddings,
            do_lower_case='-uncased' in model_or_dir)
        model = model_class.from_pretrained(model_or_dir,
                                            cache_dir=cache_dir,
                                            config=config)

        direction = 'forward'
        if model_type == 'bert':
            direction = 'autoenc'

        evaluate(examples, model, tokenizer, direction, args.criterion)
        if args.save_pred:
            save_preds(examples, osp.join(args.log_dir, 'preds.csv'))

        if not args.no_move_cached and not osp.exists(bert_dir):
            logger.info("Creating directory at {}".format(bert_dir))
            os.mkdir(bert_dir)

            model_url = model.pretrained_model_archive_map[model_or_dir]
            model_path = osp.join(bert_dir, WEIGHTS_NAME)
            move_cached(model_url, cache_dir, model_path)

            config_url = model.config.pretrained_config_archive_map[
                model_or_dir]
            config_path = osp.join(bert_dir, CONFIG_NAME)
            move_cached(config_url, cache_dir, config_path)

            for k, url_map in tokenizer.pretrained_vocab_files_map.items():
                vocab_path = osp.join(bert_dir, tokenizer.vocab_files_names[k])
                move_cached(url_map[model_or_dir], cache_dir, vocab_path)
Example #22

print("Len= ", len(tag2idx))

tag2name = {idx: tag for tag, idx in tag2idx.items()}

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
n_gpu = torch.cuda.device_count()


vocabulary = "bert_models/vocab.txt"

max_len = 45

tokenizer = BertTokenizer(vocab_file=vocabulary, do_lower_case=False)


tokenized_texts = []
word_piece_labels = []
i_inc = 0
for word_list, label in zip(sentences, labels):
    temp_label = []
    temp_token = []

    # Add [CLS] at the front
    temp_label.append('[CLS]')
    temp_token.append('[CLS]')

    for word, lab in zip(word_list, label):
        token_list = tokenizer.tokenize(word)
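
The snippet breaks off mid-loop. For reference, a hedged sketch of the usual shape of this alignment pattern (not the original author's exact continuation): every wordpiece inherits its word's label, and the sentence is closed with [SEP].

tokenized_texts, word_piece_labels = [], []
for word_list, label in zip(sentences, labels):
    temp_token, temp_label = ['[CLS]'], ['[CLS]']
    for word, lab in zip(word_list, label):
        for piece in tokenizer.tokenize(word):
            temp_token.append(piece)
            temp_label.append(lab)   # each wordpiece inherits the word label
    temp_token.append('[SEP]')
    temp_label.append('[SEP]')
    tokenized_texts.append(temp_token)
    word_piece_labels.append(temp_label)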
Example #23
def main(args):

    assert args.use_one_optim is True

    if args.recover_e > 0:
        raise NotImplementedError("This option is from my oldest code version. "
                                  "I have not checked it for this code version.")

    if not os.path.exists(args.save_dir):
        os.mkdir(args.save_dir)
        print("### mkdir {:}".format(args.save_dir))

    def worker_init_fn(worker_id):
        np.random.seed(args.random_seed + worker_id)

    n_gpu = 0
    if torch.cuda.is_available() and (not args.use_cpu):
        n_gpu = torch.cuda.device_count()
        device = torch.device('cuda')
        print("### Device: {:}".format(device))
    else:
        print("### Use CPU (Debugging)")
        device = torch.device("cpu")

    if args.random_seed < 0:
        print("### Pick a random seed")
        args.random_seed = random.sample(list(range(1, 100000)), 1)[0]

    print("### Random Seed: {:}".format(args.random_seed))
    np.random.seed(args.random_seed)
    random.seed(args.random_seed)
    rng = random.Random(args.random_seed)
    torch.manual_seed(args.random_seed)

    if n_gpu > 0:
        if args.random_seed >= 0:
            torch.cuda.manual_seed(args.random_seed)
            torch.cuda.manual_seed_all(args.random_seed)

        torch.backends.cudnn.benchmark = False
        torch.backends.cudnn.deterministic = True

    if not os.path.exists(args.save_dir):
        os.mkdir(args.save_dir)

    ontology = json.load(open(args.ontology_data))
    slot_meta, ontology = make_slot_meta(ontology)
    op2id = OP_SET[args.op_code]
    print(op2id)

    tokenizer = BertTokenizer(args.vocab_path, do_lower_case=True)

    train_path = os.path.join(args.data_root, "train.pt")
    train_data_raw = torch.load(train_path)[:5000]
    print("# train examples %d" % len(train_data_raw))

    test_path = os.path.join(args.data_root, "test.pt")
    test_data_raw = torch.load(test_path)
    print("# test examples %d" % len(test_data_raw))

    model_config = BertConfig.from_json_file(args.bert_config_path)
    model_config.dropout = args.dropout
    model_config.attention_probs_dropout_prob = args.attention_probs_dropout_prob
    model_config.hidden_dropout_prob = args.hidden_dropout_prob

    type_vocab_size = 4
    dec_config = args
    model = TransformerDST(model_config, dec_config, len(op2id), len(domain2id),
                           op2id['update'],
                           tokenizer.convert_tokens_to_ids(['[MASK]'])[0],
                           tokenizer.convert_tokens_to_ids(['[SEP]'])[0],
                           tokenizer.convert_tokens_to_ids(['[PAD]'])[0],
                           tokenizer.convert_tokens_to_ids(['-'])[0],
                           type_vocab_size, args.exclude_domain)

    test_epochs = [int(e) for e in args.load_epoch.strip().lower().split('-')]
    for best_epoch in test_epochs:
        print("### Epoch {:}...".format(best_epoch))
        sys.stdout.flush()
        ckpt_path = os.path.join(args.save_dir, 'model.e{:}.bin'.format(best_epoch))
        ckpt = torch.load(ckpt_path, map_location='cpu')
        model.load_state_dict(ckpt)
        model.to(device)

        # eval_res = model_evaluation(model, train_data_raw, tokenizer, slot_meta, best_epoch, args.op_code,
        #                             use_full_slot=args.use_full_slot, use_dt_only=args.use_dt_only, no_dial=args.no_dial, n_gpu=n_gpu,
        #                             is_gt_op=False, is_gt_p_state=False, is_gt_gen=False)
        #
        # print("### Epoch {:} Train Score : ".format(best_epoch), eval_res)
        # print('\n'*2)
        # sys.stdout.flush()

        eval_res = model_evaluation(model, test_data_raw, tokenizer, slot_meta, best_epoch, args.op_code,
                                    use_full_slot=args.use_full_slot, use_dt_only=args.use_dt_only, no_dial=args.no_dial, n_gpu=n_gpu,
                                    is_gt_op=False, is_gt_p_state=False, is_gt_gen=False)

        print("### Epoch {:} Test Score : ".format(best_epoch), eval_res)
        print('\n'*2)
        sys.stdout.flush()
Example #24
def test():
    # torch.autograd.set_detect_anomaly(True)
    load_save_model = False
    lr = 1e-5
    batch_size = 4
    gpu = True
    torch.manual_seed(0)
    device = torch.device('cpu')
    if gpu:
        device = torch.device('cuda')

    tokenizer = BertTokenizer(vocab_file='publish/vocab.txt', max_len=512)
    dataset, known_token = load_dataset('TRAIN/Train_reviews.csv', 'TRAIN/Train_labels.csv', tokenizer)
    train_dataset, validate_dataset = split_dataset(dataset, 'TRAIN/shuffle.idx', 0.97)

    bert_pretraining = convert_tf_checkpoint_to_pytorch('./publish/bert_model.ckpt', './publish/bert_config.json')
    model = Model(bert_pretraining.bert)

    # tokenizer = BertTokenizer.from_pretrained('bert-base-chinese', cache_dir='bert-base-chinese')
    train_dataset = Dataset(train_dataset)
    train_dataloader = torch_data.DataLoader(
        dataset=train_dataset,
        batch_size=batch_size,
        shuffle=True,
        collate_fn=collate_fn(tokenizer)
    )
    validate_dataset = Dataset(validate_dataset)
    validate_dataloader = torch_data.DataLoader(
        dataset=validate_dataset,
        batch_size=batch_size,
        shuffle=True,
        collate_fn=collate_fn(tokenizer)
    )

    model = model.cuda()
    optimizer = torch.optim.Adam(model.parameters(), lr=lr, weight_decay=0.01)

    statistic = {
        'best_f1': -100,
        'best_f1_epoch': None,
        'best_match_f1': -100,
        'best_match_epoch': None,
        'epoch_detail': []
    }

    if load_save_model:
        model.load_state_dict(torch.load('./save_model/best.model'))

    for epoch in range(15):
        print(str(epoch) + '------------------------------------------------------------------')
        accum_total_loss = 0
        accum_seq_labeling_loss = 0
        accum_match_loss = 0
        accum_category_loss = 0
        accum_polarity_loss = 0

        model.train()
        pbar = tqdm()
        try:
            for step, (batch_X, len_X, mask, gather_idx, targets) in enumerate(train_dataloader):
                batch_X = batch_X.to(device)
                mask = mask.to(device)
                # tokenizer.decode(list(batch_X[0].cpu().numpy())).replace(' ', '')
                scores, gather_idx = model(batch_X, len_X, mask, gather_idx)
                loss = model.loss(scores, targets, mask)

                optimizer.zero_grad()
                loss[0].backward()
                optimizer.step()

                accum_total_loss += loss[0].cpu().detach().numpy()
                accum_seq_labeling_loss += loss[1].cpu().detach().numpy()
                if not isinstance(loss[2], int):
                    accum_match_loss += loss[2].cpu().detach().numpy()
                accum_category_loss += loss[3].cpu().detach().numpy()
                accum_polarity_loss += loss[4].cpu().detach().numpy()

                pbar.update(batch_size)
                pbar.set_description('step: %d, total loss: %f, seq loss: %f, match loss: %f, category loss: %f, polarity loss: %f' % \
                    (step, accum_total_loss / (step + 1), accum_seq_labeling_loss / (step + 1), accum_match_loss / (step + 1),\
                        accum_category_loss / (step + 1), accum_polarity_loss / (step + 1)))
        except KeyboardInterrupt:
            pbar.close()
            raise
        pbar.close()
        optimizer.zero_grad()
        loss_statistic = {
            'total_loss': accum_total_loss / (step + 1),
            'seq_loss': accum_seq_labeling_loss / (step + 1),
            'match_loss': accum_match_loss / (step + 1),
            'category_loss': accum_category_loss / (step + 1),
            'polarity_loss': accum_polarity_loss / (step + 1)
        }
        
        
        model.eval()
        total_gt_seq_target = []
        total_gt_match_target = []
        total_gt_single_aspect_category_target = []
        total_gt_single_opinion_category_target = []
        total_gt_cross_category_target = []
        total_gt_single_aspect_polarity_target = []
        total_gt_single_opinion_polarity_target = []
        total_gt_cross_polarity_target = []

        total_pred_seq_target = []
        total_pred_match_target = []
        total_pred_single_aspect_category_target = []
        total_pred_single_opinion_category_target = []
        total_pred_cross_category_target = []
        total_pred_single_aspect_polarity_target = []
        total_pred_single_opinion_polarity_target = []
        total_pred_cross_polarity_target = []
        pbar = tqdm()
        try:
            for step, (batch_X, len_X, mask, gather_idx, targets) in enumerate(validate_dataloader):
                batch_X = batch_X.to(device)
                mask = mask.to(device)

                scores, gather_idx = model(batch_X, len_X, mask, gather_idx)

                (pred_seq_target, pred_match_target, pred_single_aspect_category_target, pred_single_opinion_category_target,\
                    pred_cross_category_target, pred_single_aspect_polarity_target, pred_single_opinion_polarity_target,\
                        pred_cross_polarity_target) = model.infer(scores, mask)

                (seq_target, match_target, single_aspect_category_target, single_opinion_category_target, cross_category_target,\
                    single_aspect_polarity_target, single_opinion_polarity_target, cross_polarity_target) = targets

                total_pred_seq_target.append(pred_seq_target.view(-1).cpu().detach().numpy())
                total_gt_seq_target.append(seq_target.view(-1).cpu().detach().numpy())

                for b in range(len(pred_match_target)):
                    if pred_match_target[b] is not None:
                        assert match_target[b].numel() != 0
                        total_pred_match_target.append(pred_match_target[b].view(-1).cpu().detach().numpy())
                        total_gt_match_target.append(match_target[b].view(-1).cpu().detach().numpy())

                    if pred_single_aspect_category_target[b] is not None:
                        total_pred_single_aspect_category_target.append(pred_single_aspect_category_target[b].cpu().detach().numpy())
                        total_gt_single_aspect_category_target.append(single_aspect_category_target[b].cpu().detach().numpy())
                    if pred_single_opinion_category_target[b] is not None:
                        total_pred_single_opinion_category_target.append(pred_single_opinion_category_target[b].cpu().detach().numpy())
                        total_gt_single_opinion_category_target.append(single_opinion_category_target[b].cpu().detach().numpy())
                    if pred_cross_category_target[b] is not None:
                        total_pred_cross_category_target.append(pred_cross_category_target[b].view(-1).cpu().detach().numpy())
                        total_gt_cross_category_target.append(cross_category_target[b].view(-1).cpu().detach().numpy())
                    if pred_single_aspect_polarity_target[b] is not None:
                        total_pred_single_aspect_polarity_target.append(pred_single_aspect_polarity_target[b].cpu().detach().numpy())
                        total_gt_single_aspect_polarity_target.append(single_aspect_polarity_target[b].cpu().detach().numpy())
                    if pred_single_opinion_polarity_target[b] is not None:
                        total_pred_single_opinion_polarity_target.append(pred_single_opinion_polarity_target[b].cpu().detach().numpy())
                        total_gt_single_opinion_polarity_target.append(single_opinion_polarity_target[b].cpu().detach().numpy())
                    if pred_cross_polarity_target[b] is not None:
                        total_pred_cross_polarity_target.append(pred_cross_polarity_target[b].view(-1).cpu().detach().numpy())
                        total_gt_cross_polarity_target.append(cross_polarity_target[b].view(-1).cpu().detach().numpy())

                pbar.update(batch_size)
                pbar.set_description('step: %d' % step)
        except KeyboardInterrupt:
            pbar.close()
            raise
        pbar.close()

        total_gt_seq_target = np.concatenate(total_gt_seq_target)
        total_gt_match_target = np.concatenate(total_gt_match_target)
        total_gt_single_aspect_category_target = np.concatenate(total_gt_single_aspect_category_target)
        total_gt_single_opinion_category_target = np.concatenate(total_gt_single_opinion_category_target)
        total_gt_cross_category_target = np.concatenate(total_gt_cross_category_target)
        total_gt_single_aspect_polarity_target = np.concatenate(total_gt_single_aspect_polarity_target)
        total_gt_single_opinion_polarity_target = np.concatenate(total_gt_single_opinion_polarity_target)
        total_gt_cross_polarity_target = np.concatenate(total_gt_cross_polarity_target)

        total_pred_seq_target = np.concatenate(total_pred_seq_target)
        total_pred_match_target = np.concatenate(total_pred_match_target)
        total_pred_single_aspect_category_target = np.concatenate(total_pred_single_aspect_category_target)
        total_pred_single_opinion_category_target = np.concatenate(total_pred_single_opinion_category_target)
        total_pred_cross_category_target = np.concatenate(total_pred_cross_category_target)
        total_pred_single_aspect_polarity_target = np.concatenate(total_pred_single_aspect_polarity_target)
        total_pred_single_opinion_polarity_target = np.concatenate(total_pred_single_opinion_polarity_target)
        total_pred_cross_polarity_target = np.concatenate(total_pred_cross_polarity_target)

        total_gt_category_target = np.concatenate((total_gt_single_aspect_category_target, total_gt_single_opinion_category_target,\
            total_gt_cross_category_target))
        total_pred_category_target = np.concatenate((total_pred_single_aspect_category_target, total_pred_single_opinion_category_target,\
            total_pred_cross_category_target))
        total_gt_polarity_target = np.concatenate((total_gt_single_aspect_polarity_target, total_gt_single_opinion_polarity_target,\
            total_gt_cross_polarity_target))
        total_pred_polarity_target = np.concatenate((total_pred_single_aspect_polarity_target, total_pred_single_opinion_polarity_target,\
            total_pred_cross_polarity_target))

        seq_metric = seq_f1(total_pred_seq_target, total_gt_seq_target)
        match_f1 = f1_score(total_gt_match_target, total_pred_match_target)
        match_p = precision_score(total_gt_match_target, total_pred_match_target)
        match_r = recall_score(total_gt_match_target, total_pred_match_target)
        category_f1 = seq_f1(total_pred_category_target, total_gt_category_target, 'macro')
        polarity_f1 = seq_f1(total_pred_polarity_target, total_gt_polarity_target, 'macro')
        print('Others: %f, B_A: %f, I_A: %f, B_O: %f, I_O: %f, ' % tuple(seq_metric), end='')
        
        print('match: %f, ' % match_f1, end='')
        print('match p: %f, ' % match_p, end='')
        print('match r: %f, ' % match_r, end='')
        print('category: %f, ' % category_f1, end='')
        print('polarity: %f, ' % polarity_f1, end='')

        epoch_statistic = {
            'seq_metric': tuple(seq_metric),
            'seq':  'Others: %f, B_A: %f, I_A: %f, B_O: %f, I_O: %f, ' % tuple(seq_metric),
            'match': match_f1,
            'match_p': match_p,
            'match_r': match_r,
            'category': category_f1,
            'polarity': polarity_f1,
            'loss': loss_statistic
        }
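        # 8 scores: the five per-tag seq F1s plus the match, category, and polarity F1s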
        avg_f1 = (np.sum(seq_metric) + match_f1 + category_f1 + polarity_f1) / 8
        print('avg: %f' % avg_f1)
        if avg_f1 > statistic['best_f1']:
            statistic['best_f1'] = avg_f1
            statistic['best_f1_epoch'] = epoch
            torch.save(model.state_dict(), 'save_model/best.model')
        if match_f1 > statistic['best_match_f1']:
            statistic['best_match_f1'] = match_f1
            statistic['best_match_epoch'] = epoch
            torch.save(model.state_dict(), 'save_model/best_match.model')
        statistic['epoch_detail'].append(epoch_statistic)
Example No. 25
def main(args):

    assert args.use_one_optim is True

    if args.use_cls_only:
        args.no_dial = True

    print("### use_cls_only: {:}".format(args.use_cls_only))
    print("### no_dial: {:}".format(args.no_dial))

    if args.recover_e > 0:
        raise NotImplementedError("This option is from my oldest code version. "
                                  "I have not checked it for this code version.")

    if not os.path.exists(args.save_dir):
        os.mkdir(args.save_dir)
        print("### mkdir {:}".format(args.save_dir))

    def worker_init_fn(worker_id):
        np.random.seed(args.random_seed + worker_id)

    n_gpu = 0
    if torch.cuda.is_available() and (not args.use_cpu):
        n_gpu = torch.cuda.device_count()
        device = torch.device('cuda')
        print("### Device: {:}".format(device))
    else:
        print("### Use CPU (Debugging)")
        device = torch.device("cpu")

    if args.random_seed < 0:
        print("### Pick a random seed")
        args.random_seed = random.randrange(100000)

    print("### Random Seed: {:}".format(args.random_seed))
    np.random.seed(args.random_seed)
    random.seed(args.random_seed)
    rng = random.Random(args.random_seed)
    torch.manual_seed(args.random_seed)

    if n_gpu > 0:
        if args.random_seed >= 0:
            torch.cuda.manual_seed(args.random_seed)
            torch.cuda.manual_seed_all(args.random_seed)

        torch.backends.cudnn.benchmark = False
        torch.backends.cudnn.deterministic = True

    ontology = json.load(open(args.ontology_data))
    slot_meta, ontology = make_slot_meta(ontology)
    op2id = OP_SET[args.op_code]
    print(op2id)

    tokenizer = BertTokenizer(args.vocab_path, do_lower_case=True)

    train_path = os.path.join(args.data_root, "train.pt")
    dev_path = os.path.join(args.data_root, "dev.pt")
    test_path = os.path.join(args.data_root, "test.pt")

    if not os.path.exists(test_path):
        test_data_raw = prepare_dataset(data_path=args.test_data_path,
                                        tokenizer=tokenizer,
                                        slot_meta=slot_meta,
                                        n_history=args.n_history,
                                        max_seq_length=args.max_seq_length,
                                        op_code=args.op_code)
        torch.save(test_data_raw, test_path)
    else:
        test_data_raw = torch.load(test_path)

    print("# test examples %d" % len(test_data_raw))

    if not os.path.exists(train_path):
        train_data_raw = prepare_dataset(data_path=args.train_data_path,
                                         tokenizer=tokenizer,
                                         slot_meta=slot_meta,
                                         n_history=args.n_history,
                                         max_seq_length=args.max_seq_length,
                                         op_code=args.op_code)

        torch.save(train_data_raw, train_path)
    else:
        train_data_raw = torch.load(train_path)

    train_data = MultiWozDataset(train_data_raw,
                                 tokenizer,
                                 slot_meta,
                                 args.max_seq_length,
                                 rng,
                                 ontology,
                                 args.word_dropout,
                                 args.shuffle_state,
                                 args.shuffle_p, pad_id=tokenizer.convert_tokens_to_ids(['[PAD]'])[0],
                                 slot_id=tokenizer.convert_tokens_to_ids(['[SLOT]'])[0],
                                 decoder_teacher_forcing=args.decoder_teacher_forcing,
                                 use_full_slot=args.use_full_slot,
                                 use_dt_only=args.use_dt_only, no_dial=args.no_dial,
                                 use_cls_only=args.use_cls_only)

    print("# train examples %d" % len(train_data_raw))

    if not os.path.exists(dev_path):
        dev_data_raw = prepare_dataset(data_path=args.dev_data_path,
                                       tokenizer=tokenizer,
                                       slot_meta=slot_meta,
                                       n_history=args.n_history,
                                       max_seq_length=args.max_seq_length,
                                       op_code=args.op_code)
        torch.save(dev_data_raw,  dev_path)
    else:
        dev_data_raw = torch.load(dev_path)

    print("# dev examples %d" % len(dev_data_raw))

    model_config = BertConfig.from_json_file(args.bert_config_path)
    model_config.dropout = args.dropout
    model_config.attention_probs_dropout_prob = args.attention_probs_dropout_prob
    model_config.hidden_dropout_prob = args.hidden_dropout_prob

    type_vocab_size = 4
    dec_config = args
    model = TransformerDST(model_config, dec_config, len(op2id), len(domain2id),
                           op2id['update'],
                           tokenizer.convert_tokens_to_ids(['[MASK]'])[0],
                           tokenizer.convert_tokens_to_ids(['[SEP]'])[0],
                           tokenizer.convert_tokens_to_ids(['[PAD]'])[0],
                           tokenizer.convert_tokens_to_ids(['-'])[0],
                           type_vocab_size, args.exclude_domain)

    if not os.path.exists(args.bert_ckpt_path):
        args.bert_ckpt_path = download_ckpt(args.bert_ckpt_path, args.bert_config_path, 'assets')

    state_dict = torch.load(args.bert_ckpt_path, map_location='cpu')
    _k = 'embeddings.token_type_embeddings.weight'
    print("config.type_vocab_size != state_dict[bert.embeddings.token_type_embeddings.weight] ({0} != {1})".format(
            type_vocab_size, state_dict[_k].shape[0]))
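    # Grow the checkpoint's token_type embedding to type_vocab_size rows and
    # initialize the new segment rows (2 and 3) from segment row 0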
    state_dict[_k].resize_(
        type_vocab_size, state_dict[_k].shape[1])
    state_dict[_k].data[2, :].copy_(state_dict[_k].data[0, :])
    state_dict[_k].data[3, :].copy_(state_dict[_k].data[0, :])
    model.bert.load_state_dict(state_dict)
    print("\n### Done Load BERT")
    sys.stdout.flush()

    # re-initialize added special tokens ([SLOT], [NULL], [EOS])
    model.bert.embeddings.word_embeddings.weight.data[1].normal_(mean=0.0, std=0.02)
    model.bert.embeddings.word_embeddings.weight.data[2].normal_(mean=0.0, std=0.02)
    model.bert.embeddings.word_embeddings.weight.data[3].normal_(mean=0.0, std=0.02)

    # re-initialize seg-2, seg-3
    model.bert.embeddings.token_type_embeddings.weight.data[2].normal_(mean=0.0, std=0.02)
    model.bert.embeddings.token_type_embeddings.weight.data[3].normal_(mean=0.0, std=0.02)
    model.to(device)

    num_train_steps = int(len(train_data_raw) / args.batch_size * args.n_epochs)

    if args.use_one_optim:
        print("### Use One Optim")
        param_optimizer = list(model.named_parameters())
        no_decay = ['bias', 'LayerNorm.bias', 'LayerNorm.weight']
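        # apply weight decay to all parameters except biases and LayerNorm weights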
        optimizer_grouped_parameters = [
            {'params': [p for n, p in param_optimizer if not any(
                nd in n for nd in no_decay)], 'weight_decay': 0.01},
            {'params': [p for n, p in param_optimizer if any(
                nd in n for nd in no_decay)], 'weight_decay': 0.0}
        ]
        optimizer = AdamW(optimizer_grouped_parameters, lr=args.enc_lr)
        scheduler = WarmupLinearSchedule(optimizer, int(num_train_steps * args.enc_warmup),
                                             t_total=num_train_steps)
    else:
        no_decay = ['bias', 'LayerNorm.bias', 'LayerNorm.weight']
        enc_param_optimizer = list(model.bert.named_parameters())  # TODO: For BERT only
        print('### Optim BERT: {:}'.format(len(enc_param_optimizer)))
        enc_optimizer_grouped_parameters = [
            {'params': [p for n, p in enc_param_optimizer if not any(nd in n for nd in no_decay)], 'weight_decay': 0.01},
            {'params': [p for n, p in enc_param_optimizer if any(nd in n for nd in no_decay)], 'weight_decay': 0.0}
            ]

        enc_optimizer = AdamW(enc_optimizer_grouped_parameters, lr=args.enc_lr)
        enc_scheduler = WarmupLinearSchedule(enc_optimizer, int(num_train_steps * args.enc_warmup),
                                             t_total=num_train_steps)

        dec_param_optimizer = list(model.named_parameters())  # TODO:  For other parameters
        print('### Optim All: {:}'.format(len(dec_param_optimizer)))
        dec_param_optimizer = [p for (n, p) in dec_param_optimizer if 'bert' not in n]
        print('### Optim OTH: {:}'.format(len(dec_param_optimizer)))
        dec_optimizer = AdamW(dec_param_optimizer, lr=args.dec_lr)
        dec_scheduler = WarmupLinearSchedule(dec_optimizer, int(num_train_steps * args.dec_warmup),
                                             t_total=num_train_steps)

    if args.recover_e > 0:
        model_recover, enc_recover, dec_recover = load(args, str(args.recover_e))
        print("### Recover Model E{:}".format(args.recover_e))
        sys.stdout.flush()
        model.load_state_dict(model_recover)
        print("### Recover Optim E{:}".format(args.recover_e))
        sys.stdout.flush()
        enc_optimizer.load_state_dict(enc_recover)
        dec_optimizer.load_state_dict(dec_recover)

    if n_gpu > 1:
        model = torch.nn.DataParallel(model)

    train_sampler = RandomSampler(train_data)
    train_dataloader = DataLoader(train_data,
                                  sampler=train_sampler,
                                  batch_size=args.batch_size,
                                  collate_fn=train_data.collate_fn,
                                  num_workers=args.num_workers,
                                  worker_init_fn=worker_init_fn)

    loss_fnc = nn.CrossEntropyLoss()
    best_score = {'epoch': 0, 'joint_acc': 0, 'op_acc': 0, 'final_slot_f1': 0}

    start_time = time.time()

    for epoch in range(args.n_epochs):
        batch_loss = []
        model.train()
        for step, batch in enumerate(train_dataloader):

            batch = [b.to(device) if not isinstance(b, (int, dict, list, np.ndarray)) else b for b in batch]

            input_ids_p, segment_ids_p, input_mask_p, \
            state_position_ids, op_ids, domain_ids, input_ids_g, segment_ids_g, position_ids_g, input_mask_g, \
            masked_pos, masked_weights, lm_label_ids, id_n_map, gen_max_len, n_total_pred = batch

            domain_scores, state_scores, loss_g = model(input_ids_p, segment_ids_p, input_mask_p, state_position_ids,
                input_ids_g, segment_ids_g, position_ids_g, input_mask_g,
                masked_pos, masked_weights, lm_label_ids, id_n_map, gen_max_len, only_pred_op=args.only_pred_op, n_gpu=n_gpu)

            if n_total_pred > 0:
                loss_g = loss_g.sum() / n_total_pred
            else:
                loss_g = 0

            loss_s = loss_fnc(state_scores.view(-1, len(op2id)), op_ids.view(-1))

            if args.only_pred_op:
                loss = loss_s
            else:
                loss = loss_s + loss_g

            if not args.exclude_domain:
                loss_d = loss_fnc(domain_scores.view(-1, len(domain2id)), domain_ids.view(-1))
                loss = loss + loss_d

            batch_loss.append(loss.item())

            loss.backward()

            if args.use_one_optim:
                optimizer.step()
                scheduler.step()
            else:
                enc_optimizer.step()
                enc_scheduler.step()
                dec_optimizer.step()
                dec_scheduler.step()

            model.zero_grad()

            if step % 100 == 0:
                # loss_g is a plain 0 (not a tensor) when there was nothing to generate
                loss_g = loss_g.item() if torch.is_tensor(loss_g) else loss_g

                if not args.exclude_domain:
                    print("time %.1f min, [%d/%d] [%d/%d] mean_loss : %.3f, state_loss : %.3f, gen_loss : %.3f, dom_loss : %.3f" \
                          % ((time.time()-start_time)/60, epoch+1, args.n_epochs, step,
                             len(train_dataloader), np.mean(batch_loss),
                             loss_s.item(), loss_g, loss_d.item()))
                else:
                    print("time %.1f min, [%d/%d] [%d/%d] mean_loss : %.3f, state_loss : %.3f, gen_loss : %.3f" \
                          % ((time.time()-start_time)/60, epoch+1, args.n_epochs, step,
                             len(train_dataloader), np.mean(batch_loss),
                             loss_s.item(), loss_g))

                sys.stdout.flush()
                batch_loss = []

        if args.use_one_optim:
            save(args, epoch + 1, model, optimizer)
        else:
            save(args, epoch + 1, model, enc_optimizer, dec_optimizer)

        if ((epoch+1) % args.eval_epoch == 0) and (epoch+1 >= 8):
            eval_res = model_evaluation(model, dev_data_raw, tokenizer, slot_meta, epoch+1, args.op_code,
                                        use_full_slot=args.use_full_slot, use_dt_only=args.use_dt_only, no_dial=args.no_dial, use_cls_only=args.use_cls_only, n_gpu=n_gpu)
            print("### Epoch {:} Score : ".format(epoch+1), eval_res)

            if eval_res['joint_acc'] > best_score['joint_acc']:
                best_score = eval_res
                print("### Best Joint Acc: {:} ###".format(best_score['joint_acc']))
                print('\n')

                if epoch+1 >= 8:  # To speed up
                    eval_res_test = model_evaluation(model, test_data_raw, tokenizer, slot_meta, epoch + 1, args.op_code,
                                                     use_full_slot=args.use_full_slot, use_dt_only=args.use_dt_only, no_dial=args.no_dial, use_cls_only=args.use_cls_only, n_gpu=n_gpu)
                    print("### Epoch {:} Test Score : ".format(epoch + 1), eval_res_test)
Example No. 26
def to_bert_input(tokens, bert_tokenizer):
    # signature reconstructed from the call site below; builds BERT inputs
    # (token ids, segment ids, attention mask) from a token list
    token_idx = torch.tensor(bert_tokenizer.convert_tokens_to_ids(tokens))
    sep_idx = tokens.index('[SEP]')
    segment_idx = token_idx * 0
    segment_idx[(sep_idx + 1):] = 1
    mask = (token_idx != 0)
    return token_idx.unsqueeze(0), segment_idx.unsqueeze(0), mask.unsqueeze(0)
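
# Illustration: for tokens = ['[CLS]', 'the', 'cat', '[SEP]'] this returns three
# (1, 4) tensors: token ids, all-zero segment ids (positions after a first [SEP]
# would get segment 1), and a mask that is True wherever the token id is nonzero.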


if __name__ == '__main__':
    args = parser.parse_args()
    assert os.path.exists(args.bert_model), '{} does not exist'.format(args.bert_model)
    assert os.path.exists(args.bert_vocab), '{} does not exist'.format(args.bert_vocab)
    assert args.topk > 0, '{} should be positive'.format(args.topk)

    print('Initialize BERT vocabulary from {}...'.format(args.bert_vocab))
    bert_tokenizer = BertTokenizer(vocab_file=args.bert_vocab)
    print('Initialize BERT model from {}...'.format(args.bert_model))
    config = BertConfig.from_json_file('./bert-base-uncased/config.json')
    # load the checkpoint asserted above instead of a hard-coded weight path
    bert_model = BertForMaskedLM.from_pretrained(args.bert_model, config=config)
    bert_model.eval()

    while True:
        message = input('Enter your message: ').strip()
        tokens = bert_tokenizer.tokenize(message)
        if len(tokens) == 0:
            continue
        if tokens[0] != CLS:
            tokens = [CLS] + tokens
        if tokens[-1] != SEP:
            tokens.append(SEP)
        token_idx, segment_idx, mask = to_bert_input(tokens, bert_tokenizer)
        with torch.no_grad():
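            # The listing breaks off here; a minimal completion sketch, assuming the
            # goal is to print the top-k predictions for every '[MASK]' token
            # (the lines below are a hypothetical reconstruction, not the original code):
            logits = bert_model(token_idx, token_type_ids=segment_idx, attention_mask=mask)[0]
        for pos, token in enumerate(tokens):
            if token == '[MASK]':
                top_ids = torch.topk(logits[0, pos], args.topk).indices.tolist()
                print('{}: {}'.format(pos, ' '.join(bert_tokenizer.convert_ids_to_tokens(top_ids))))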
Example No. 27
def main():
    pred_file_path = 'test.csv'
    load_save_model = True
    lr = 1e-5
    batch_size = 8
    gpu = True
    torch.manual_seed(0)
    device = torch.device('cpu')
    if gpu and torch.cuda.is_available():
        device = torch.device('cuda')

    tokenizer = BertTokenizer(vocab_file='publish/vocab.txt', max_len=512)
    _, known_token = load_dataset('TRAIN/Train_reviews.csv',
                                  'TRAIN/Train_labels.csv', tokenizer)
    dataset = load_review_dataset('TRAIN/TEST/Test_reviews.csv')
    dataset = Dataset(list(dataset.items()))
    dataloader = torch_data.DataLoader(dataset=dataset,
                                       batch_size=batch_size,
                                       shuffle=False,
                                       collate_fn=test_collate_fn(
                                           tokenizer, known_token))
    bert_pretraining = convert_tf_checkpoint_to_pytorch(
        './publish/bert_model.ckpt', './publish/bert_config.json')
    model = Model(bert_pretraining.bert)

    model = model.to(device)
    if load_save_model:
        model.load_state_dict(torch.load('./save_model/best.model'))

    pred_file = open(pred_file_path, mode='w', encoding='utf-8')

    pbar = tqdm()
    model.eval()
    for step, (batch_X, len_X, mask, batch_idx,
               origin_batch_X) in enumerate(dataloader):
        batch_X = batch_X.to(device)
        mask = mask.to(device)

        scores, gather_idx = model(batch_X, len_X, mask, None)
        (pred_seq_target, pred_match_target, pred_single_aspect_category_target, pred_single_opinion_category_target,\
            pred_cross_category_target, pred_single_aspect_polarity_target, pred_single_opinion_polarity_target,\
                pred_cross_polarity_target) = model.infer(scores, mask)

        label = []

        aspect_idx, opinion_idx = gather_idx
        for b in range(batch_X.shape[0]):
            _aspect_idx, _opinion_idx = aspect_idx[b], opinion_idx[b]
            if len(_aspect_idx) == 0 and len(_opinion_idx) == 0:
                label.append((batch_idx[b], '_', '_', '_', '_'))
                continue

            _aspect_cross, _opinion_cross = [
                False for i in range(len(_aspect_idx))
            ], [False for i in range(len(_opinion_idx))]
            for i in range(len(_aspect_idx)):
                for j in range(len(_opinion_idx)):
                    if pred_match_target[b][i, j] == 1:
                        _aspect_cross[i] = True
                        _opinion_cross[j] = True
                        category = ID2CATEGORY[pred_cross_category_target[b][
                            i, j]]
                        polarity = ID2POLARITY[pred_cross_polarity_target[b][
                            i, j]]
                        aspect = tokenizer.decode(
                            list(origin_batch_X[b, _aspect_idx[i]].cpu().
                                 detach().numpy())).replace(' ', '')
                        opinion = tokenizer.decode(
                            list(origin_batch_X[b,
                                                _opinion_idx[j]].cpu().detach(
                                                ).numpy())).replace(' ', '')
                        # aspect = tokenizer.decode(list(batch_X[b, _aspect_idx[i]].cpu().detach().numpy())).replace(' ', '')
                        # opinion = tokenizer.decode(list(batch_X[b, _opinion_idx[j]].cpu().detach().numpy())).replace(' ', '')
                        aspect_beg = len(
                            tokenizer.decode(
                                list(batch_X[b,
                                             1:_aspect_idx[i][0]].cpu().detach(
                                             ).numpy())).replace(' ', ''))
                        aspect_end = aspect_beg + len(aspect)
                        opinion_beg = len(
                            tokenizer.decode(
                                list(batch_X[b, 1:_opinion_idx[j][0]].cpu().
                                     detach().numpy())).replace(' ', ''))
                        opinion_end = opinion_beg + len(opinion)
                        label.append((batch_idx[b], aspect, opinion, category,
                                      polarity))
            for i in range(len(_aspect_idx)):
                if not _aspect_cross[i]:
                    category = ID2CATEGORY[
                        pred_single_aspect_category_target[b][i]]
                    polarity = ID2POLARITY[
                        pred_single_aspect_polarity_target[b][i]]
                    aspect = tokenizer.decode(
                        list(origin_batch_X[
                            b,
                            _aspect_idx[i]].cpu().detach().numpy())).replace(
                                ' ', '')
                    # aspect = tokenizer.decode(list(batch_X[b, _aspect_idx[i]].cpu().detach().numpy())).replace(' ', '')
                    aspect_beg = len(
                        tokenizer.decode(
                            list(batch_X[b, 1:_aspect_idx[i][0]].cpu().detach(
                            ).numpy())).replace(' ', ''))
                    aspect_end = aspect_beg + len(aspect)
                    label.append(
                        (batch_idx[b], aspect, '_', category, polarity))
            for i in range(len(_opinion_idx)):
                if not _opinion_cross[i]:
                    category = ID2CATEGORY[
                        pred_single_opinion_category_target[b][i]]
                    polarity = ID2POLARITY[
                        pred_single_opinion_polarity_target[b][i]]
                    opinion = tokenizer.decode(
                        list(origin_batch_X[
                            b,
                            _opinion_idx[i]].cpu().detach().numpy())).replace(
                                ' ', '')
                    # opinion = tokenizer.decode(list(batch_X[b, _opinion_idx[i]].cpu().detach().numpy())).replace(' ', '')
                    opinion_beg = len(
                        tokenizer.decode(
                            list(batch_X[b, 1:_opinion_idx[i][0]].cpu().detach(
                            ).numpy())).replace(' ', ''))
                    opinion_end = opinion_beg + len(opinion)
                    label.append(
                        (batch_idx[b], '_', opinion, category, polarity))

        for _label in label:
            pred_file.write(','.join(map(str, _label)) + '\n')
        pbar.update(batch_size)
        pbar.set_description('step: %d' % step)
    pred_file.close()
    pbar.close()
Example No. 28
def tag_sent(text):
    # initialize variables
    num_tags = 24  # depends on the labelling scheme
    max_len = 45
    vocabulary = "bert_models/vocab.txt"
    bert_out_address = 'bert/model'

    tokenizer = BertTokenizer(vocab_file=vocabulary, do_lower_case=False)

    model = BertForTokenClassification.from_pretrained(bert_out_address,
                                                       num_labels=num_tags)

    with open('se_data/tags.txt') as f:
        lines = f.readlines()

    tag2idx = {}
    for line in lines:
        key, val = line.split()[:2]
        tag2idx[key] = int(val)

    tag2name = {idx: tag for tag, idx in tag2idx.items()}

    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    n_gpu = torch.cuda.device_count()

    if torch.cuda.is_available():
        model.cuda()
        if n_gpu > 1:
            model = torch.nn.DataParallel(model)

    model.eval()

    tokenized_texts = []
    temp_token = []

    # Add [CLS] at the front
    temp_token.append('[CLS]')

    for word in nltk.word_tokenize(text):
        temp_token.extend(tokenizer.tokenize(word))

    # Add [SEP] at the end
    temp_token.append('[SEP]')

    tokenized_texts.append(temp_token)

    input_ids = pad_sequences(
        [tokenizer.convert_tokens_to_ids(txt) for txt in tokenized_texts],
        maxlen=max_len,
        dtype="long",
        truncating="post",
        padding="post")

    attention_masks = [[int(i > 0) for i in ii] for ii in input_ids]

    tr_inputs = torch.tensor(input_ids).to(device)
    tr_masks = torch.tensor(attention_masks).to(device)

    outputs = model(
        tr_inputs,
        token_type_ids=None,
        attention_mask=tr_masks,
    )

    logits = outputs[0]

    # Get NER predict result
    logits = torch.argmax(logits, dim=2)  # log_softmax is monotonic, so argmax over raw logits is equivalent
    logits = logits.detach().cpu().numpy()

    tags_t = [tag2name[t] for t in logits[0]]

    # Strip the leading [CLS] and trailing [SEP] before returning (tokens, tags);
    # use the shorter length in case the sentence was truncated to max_len
    c = min(len(tokenized_texts[0]), len(tags_t))
    return tokenized_texts[0][1:c - 1], tags_t[1:c - 1]
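
# A hypothetical usage sketch (not part of the original listing); assumes the
# model/vocab paths above exist and nltk's 'punkt' tokenizer data is installed:
#
#     tokens, tags = tag_sent("The battery drains quickly but the screen is great.")
#     for token, tag in zip(tokens, tags):
#         print('%s\t%s' % (token, tag))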