Example #1
 def test_ddt(self, condition, username, password):
     """Test login."""
     logger.info("username: {}".format(username))
     logger.info("password: {}".format(password))
     self.assertTrue(condition)
     self.assertTrue(username)
     self.assertTrue(password)
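Example #1 is the inner method of a data-driven test and only runs once it is parameterized. A minimal, hypothetical sketch of the surrounding class using the ddt package (the class name and sample data are assumptions, not from the original):

import logging
import unittest

from ddt import ddt, data, unpack

logger = logging.getLogger(__name__)


@ddt
class LoginTest(unittest.TestCase):
    # Each tuple is unpacked into (condition, username, password).
    @data((True, "user1", "secret1"), (True, "user2", "secret2"))
    @unpack
    def test_ddt(self, condition, username, password):
        """Test login."""
        logger.info("username: {}".format(username))
        self.assertTrue(condition and username and password)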
Example #2
def init_last_folders():
    logger.info("Initialize LastResult and inner dirs")
    if not os.path.exists(LAST_RESULT_DIR):
        os.mkdir(LAST_RESULT_DIR)
    if not os.path.exists(LAST_LOGS_DIR):
        os.mkdir(LAST_LOGS_DIR)
    if not os.path.exists(LAST_REPORTS_DIR):
        os.mkdir(LAST_REPORTS_DIR)
    if not os.path.exists(LAST_SCREENSHOTS_DIR):
        os.mkdir(LAST_SCREENSHOTS_DIR)
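The repeated exists/mkdir pairs in this and the other folder-initialization examples (#3, #5, #7) can be collapsed with os.makedirs(exist_ok=True), which is idempotent and also creates intermediate directories. A minimal sketch, assuming the same *_DIR constants and logger are in scope:

import os

def init_last_folders():
    logger.info("Initialize LastResult and inner dirs")
    # exist_ok=True makes each call a no-op when the directory already exists
    for path in (LAST_RESULT_DIR, LAST_LOGS_DIR,
                 LAST_REPORTS_DIR, LAST_SCREENSHOTS_DIR):
        os.makedirs(path, exist_ok=True)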
Example #3
def init_results_folders():
    logger.info("Initialize Results and inner folders")
    if not os.path.exists(RESULTS_DIR):
        os.mkdir(RESULTS_DIR)
    if not os.path.exists(RESULTS_LOGS_DIR):
        os.mkdir(RESULTS_LOGS_DIR)
    if not os.path.exists(RESULTS_REPORTS_DIR):
        os.mkdir(RESULTS_REPORTS_DIR)
    if not os.path.exists(RESULTS_SCREENSHOTS_DIR):
        os.mkdir(RESULTS_SCREENSHOTS_DIR)
Example #4
    def convert_examples_to_features(self, examples, tokenizer, max_seq_length):
        features = []
        '''
        Process each example.
        '''
        for example_index, example in enumerate(examples):

            utterance_tokens = tokenizer.tokenize(example.text_a)
            response_tokens = tokenizer.tokenize(example.text_b)
            choices_features = []
            segment_maxlen = (max_seq_length-4)//2
            self._truncate_seq_pair(utterance_tokens, response_tokens, segment_maxlen)

            utterance_inputids = tokenizer.convert_tokens_to_ids(["[CLS]"] + utterance_tokens + ["[SEP]"])
            response_inputids = tokenizer.convert_tokens_to_ids(["[CLS]"] + response_tokens + ["[SEP]"])

            utterance_padding = segment_maxlen - len(utterance_tokens)
            response_padding = segment_maxlen - len(response_tokens)

            input_ids = utterance_inputids+[0]*utterance_padding+response_inputids+[0]*response_padding
            # print(len(input_ids),len(utterance_inputids),utterance_padding,len(response_inputids),response_padding)
            segment_ids = [1]*(segment_maxlen+2)+[0]*(segment_maxlen+2)
            input_mask = [1]*(len(utterance_inputids))+[0]*utterance_padding+[1]*len(response_inputids)+[0]*response_padding
            utterance_mask = [1]*(segment_maxlen+2)+[0]*(segment_maxlen+2)
            response_mask = [0]*(segment_maxlen+2)+[1]*(segment_maxlen+2)

            padding_length = max_seq_length - len(input_ids)
            input_ids += ([0] * padding_length)
            input_mask += ([0] * padding_length)
            segment_ids += ([0] * padding_length)
            utterance_mask += ([0] * padding_length)
            response_mask += ([0] * padding_length)

            # print(len(input_ids),len(segment_ids),len(input_mask),len(utterance_mask),len(response_mask))
            assert len(input_ids) ==len(segment_ids) ==len(input_mask) ==len(utterance_mask) == len(response_mask)

            choices_features.append((utterance_tokens+response_tokens, input_ids, input_mask, segment_ids,utterance_mask,response_mask))

            label = example.label

            if example_index < 3:
                logger.info("*** Example ***")
                logger.info("idx: {}".format(example_index))
                logger.info("guid: {}".format(example.guid))
                logger.info("tokens: {}".format(' '.join(utterance_tokens+response_tokens).replace('\u2581', '_')))
                logger.info("label: {}".format(label))
            features.append(
                InputFeatures(
                    example_id=example.guid,
                    choices_features=choices_features,
                    label=label
                )
            )
        return features
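Example #4 calls a _truncate_seq_pair helper that is not shown. Since each segment is padded independently to segment_maxlen afterwards (the padding counts would go negative otherwise), the helper presumably caps each token list in place; a sketch under that assumption:

    def _truncate_seq_pair(self, tokens_a, tokens_b, max_length):
        # Truncate each segment in place to at most max_length tokens,
        # so utterance_padding and response_padding stay non-negative.
        del tokens_a[max_length:]
        del tokens_b[max_length:]

As a sanity check on the layout: with max_seq_length = 128, segment_maxlen is (128 - 4) // 2 = 62, each [CLS]...[SEP] block occupies 64 positions, and the two blocks fill the input exactly.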
Example #5
def init_backup_folders():
    logger.info("Initialize BackupResult and inner dirs")
    if not os.path.exists(BACKUP_RESULT_DIR):
        os.mkdir(BACKUP_RESULT_DIR)
    if not os.path.exists(backup_date_dir):
        os.mkdir(backup_date_dir)
    if not os.path.exists(backup_logs_dir):
        os.mkdir(backup_logs_dir)
    if not os.path.exists(backup_reports_dir):
        os.mkdir(backup_reports_dir)
    if not os.path.exists(backup_screenshots_dir):
        os.mkdir(backup_screenshots_dir)
Example #6
def backup_results(file_type, results_folder_path, target_folder_path):
    for root, dirs, files in os.walk(results_folder_path):
        for file in files:
            if file.split(".")[-1] == file_type:
                file_path = os.path.join(root, file)
                target_path = os.path.join(target_folder_path, file)
                # Overwrite the target file if one with the same name exists
                if os.path.exists(target_path):
                    os.remove(target_path)
                shutil.copy(file_path, target_path)
    logger.info("Backup {0}".format(results_folder_path))
    shutil.rmtree(results_folder_path, ignore_errors=True)
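A hedged usage sketch for backup_results, borrowing the directory constants from Examples #2 and #5; note the function deletes the source folder after copying:

# Copy all *.log files from the last run's log folder, then remove that folder.
backup_results("log", LAST_LOGS_DIR, backup_logs_dir)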
Example #7
def init_backup_folders():
    logger.info("Initialize Backup and inner folders")
    if not os.path.exists(BACKUP_DIR):
        os.mkdir(BACKUP_DIR)
    if not os.path.exists(backup_date_path):
        os.mkdir(backup_date_path)
    if not os.path.exists(backup_logs_path):
        os.mkdir(backup_logs_path)
    if not os.path.exists(backup_reports_path):
        os.mkdir(backup_reports_path)
    if not os.path.exists(backup_screenshots_path):
        os.mkdir(backup_screenshots_path)
Example #8
    def convert_examples_to_features(self, examples, tokenizer,
                                     max_seq_length):
        features = []
        '''
        Process each example.
        '''
        for example_index, example in enumerate(examples):

            context_tokens = tokenizer.tokenize(example.text_a)
            ending_tokens = tokenizer.tokenize(example.text_b)

            choices_features = []
            context_tokens, ending_tokens = self._truncate_seq_pair(
                context_tokens, ending_tokens, max_seq_length - 2)
            # self.truncature(context_tokens, ending_tokens, max_seq_length)
            # tokens = ["[CLS]"] + ending_tokens + ["[SEP]"] + context_tokens + ["[SEP]"]
            utt_tokens = ["[CLS]"] + context_tokens + ["[SEP]"]
            resp_tokens = ["[CLS]"] + ending_tokens + ["[SEP]"]
            # segment_ids = [0] * (len(ending_tokens) + 2) + [1] * (len(context_tokens) + 1)
            # input_ids = tokenizer.convert_tokens_to_ids(tokens)
            utt_input_ids = tokenizer.convert_tokens_to_ids(utt_tokens)
            resp_input_ids = tokenizer.convert_tokens_to_ids(resp_tokens)
            # input_mask = [1] * len(input_ids)
            utt_input_mask = [1] * len(utt_input_ids)
            resp_input_mask = [1] * len(resp_input_ids)

            utt_padding_length = max_seq_length - len(utt_input_ids)
            resp_padding_length = max_seq_length - len(resp_input_ids)
            utt_input_ids += ([0] * utt_padding_length)
            resp_input_ids += ([0] * resp_padding_length)
            utt_input_mask += ([0] * utt_padding_length)
            resp_input_mask += ([0] * resp_padding_length)
            # segment_ids += ([0] * padding_length)
            # choices_features.append((tokens, input_ids, input_mask, segment_ids))
            choices_features.append(
                (utt_tokens, utt_input_ids, utt_input_mask, resp_tokens,
                 resp_input_ids, resp_input_mask))
            label = example.label
            if example_index < 3:
                logger.info("*** Example ***")
                logger.info("idx: {}".format(example_index))
                logger.info("guid: {}".format(example.guid))
                logger.info("tokens: {}".format(
                    ' '.join(utt_tokens + resp_tokens).replace('\u2581', '_')))
                logger.info("label: {}".format(label))
            features.append(
                InputFeatures(example_id=example.guid,
                              choices_features=choices_features,
                              label=label))
        return features
Example #9
def html_test_runner(test_suite,
                     report_title=HTML_REPORT_TITLE,
                     report_description=HTML_REPORT_DESCRIPTION,
                     tester=HTML_REPORT_TESTER):
    now_time = time.strftime("%Y-%m-%d_%H-%M-%S")
    report_name = "{0}_HtmlTestReport.html".format(now_time)
    full_report_name = os.path.join(LAST_REPORTS_DIR, report_name)
    with open(full_report_name, 'wb') as f:
        runner = HTMLTestRunner(stream=f,
                                title=report_title,
                                description=report_description,
                                tester=tester)
        logger.info("Start to test...\n")
        runner.run(test_suite)
        logger.info("Finish testing...")
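A short usage sketch, assuming a discoverable tests package; html_test_runner then writes a timestamped report into LAST_REPORTS_DIR:

import unittest

# Hypothetical test location; discover() returns a unittest.TestSuite.
suite = unittest.defaultTestLoader.discover("tests")
html_test_runner(suite)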
Example #10
 def on_call(*args, **kwargs):
     # Number of times to rerun the test on failure
     if rerun is False:
         rerun_time = 1
     elif isinstance(rerun, int):
         rerun_time = rerun
     else:
         rerun_time = 3
     # _browser fetches the browser attribute of the test case instance; it sits behind the xxxPage attribute layer, hence the loop over dir()
     _testcase_name = args[0]._testMethodName
     _testclass_name = args[0].__class__.__name__
     _browser = None
     for attr in dir(args[0]):
         if hasattr(getattr(args[0], attr), 'browser'):
             _browser = getattr(getattr(args[0], attr), 'browser')
             break
     # Run the test case in a loop
     _rerun_time = rerun_time
     while rerun_time > 0:
         try:
             logger.info(
                 (' TestRunNo: >> {0} '.format(_rerun_time -
                                               rerun_time + 1)).center(
                                                   100, '-'))
             result = func(*args, **kwargs)
             # After the case finishes, raise any accumulated AssertionError exceptions
             args[0].raise_exc()
             logger.info(' TestResult: '.center(100, '-'))
             logger.info('[TestSuccess]: {0} >> {1} '.format(
                 _testclass_name, _testcase_name))
             return result
         except Exception:
             if screenshot:
                 _filename = 'Error_' + _testcase_name
                 _browser.take_screenshot(_filename)
             rerun_time -= 1
             if rerun_time == 0:
                 exc_type, exc_msg, _ = sys.exc_info()
                 logger.info(' TestResult: '.center(100, '-'))
                 logger.error('[TestFail]: {0}: {1}'.format(
                     exc_type.__name__, exc_msg))
                 raise
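on_call references func, rerun and screenshot as free variables, so it is almost certainly the inner function of a decorator factory. A minimal sketch of the enclosing wrapper, with the factory name and defaults assumed:

import functools

def rerun_on_failure(rerun=False, screenshot=True):  # hypothetical name and defaults
    def decorator(func):
        @functools.wraps(func)
        def on_call(*args, **kwargs):
            ...  # body as shown above
        return on_call
    return decorator

A test method would then be decorated as @rerun_on_failure(rerun=3).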
Example #11
 def launch_remote_browser(self, _command_executor, _desired_capabilities,
                           implicity_wait_timeout=IMPLICITY_WAIT_TIME):
     """
     Launch a remote browser.
     :param _command_executor: remote server address, e.g. "http://192.168.98.106:5555/wd/hub"
     :param _desired_capabilities: a DesiredCapabilities template for webdriver
     :param implicity_wait_timeout: implicit wait timeout
     :return: self.driver
     """
     logger.info("Launch remote browser")
     try:
         self.driver = webdriver.Remote(command_executor=_command_executor,
                                        desired_capabilities=_desired_capabilities)
         logger.info("Maximize browser")
         self.driver.maximize_window()
         logger.info("Set implicit wait time to {0}".format(str(implicity_wait_timeout)))
         self.driver.implicitly_wait(implicity_wait_timeout)
         return self.driver
     except WebDriverException as e:
         logger.error("Fail to launch browser: {0}".format(str(e)))
         raise e
     except Exception:
         logger.exception("Fail to launch browser", exc_info=True)
         raise
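A hedged usage sketch for launch_remote_browser; base_page stands in for whatever page object exposes the method, and the hub URL is the one quoted in the docstring:

from selenium import webdriver

capabilities = webdriver.DesiredCapabilities.CHROME.copy()
driver = base_page.launch_remote_browser(
    "http://192.168.98.106:5555/wd/hub", capabilities)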
Example #12
    def train(self):
        if not os.path.exists(self.output_dir):
            os.makedirs(self.output_dir)

        # Prepare model
        config = BertConfig.from_pretrained(self.model_name_or_path,
                                            num_labels=3)
        model = BertForSequenceClassification.from_pretrained(
            self.model_name_or_path, self.args, config=config)
        model.to(self.device)

        data_splitList = DATABDCI.load_data(os.path.join(
            self.data_dir, 'train.csv'),
                                            n_splits=5)
        for split_index, each_data in enumerate(data_splitList):
            logger.info(f'Fold {split_index + 1}')
            train_dataloader, eval_dataloader, train_examples, eval_examples = self.create_dataloader(
                each_data)

            num_train_optimization_steps = self.train_steps

            # Prepare optimizer

            param_optimizer = list(model.named_parameters())
            param_optimizer = [n for n in param_optimizer]

            no_decay = ['bias', 'LayerNorm.bias', 'LayerNorm.weight']
            optimizer_grouped_parameters = [{
                'params': [
                    p for n, p in param_optimizer
                    if not any(nd in n for nd in no_decay)
                ],
                'weight_decay':
                self.weight_decay
            }, {
                'params': [
                    p for n, p in param_optimizer
                    if any(nd in n for nd in no_decay)
                ],
                'weight_decay':
                0.0
            }]

            optimizer = AdamW(optimizer_grouped_parameters,
                              lr=self.learning_rate,
                              eps=self.adam_epsilon)
            scheduler = WarmupLinearSchedule(optimizer,
                                             warmup_steps=self.warmup_steps,
                                             t_total=self.train_steps)

            global_step = 0

            logger.info("***** Running training *****")
            logger.info("  Num examples = %d", len(train_examples))
            logger.info("  Batch size = %d", self.train_batch_size)
            logger.info("  Num steps = %d", num_train_optimization_steps)

            best_acc = 0
            model.train()
            tr_loss = 0
            nb_tr_examples, nb_tr_steps = 0, 0
            train_dataloader = cycle(train_dataloader)

            for step in range(num_train_optimization_steps):
                batch = next(train_dataloader)
                batch = tuple(t.to(self.device) for t in batch)
                input_ids, input_mask, segment_ids, label_ids = batch
                loss = model(input_ids=input_ids,
                             token_type_ids=segment_ids,
                             attention_mask=input_mask,
                             labels=label_ids)
                tr_loss += loss.item()
                train_loss = round(tr_loss / (nb_tr_steps + 1), 4)

                nb_tr_examples += input_ids.size(0)
                nb_tr_steps += 1

                loss.backward()

                if (nb_tr_steps + 1) % self.gradient_accumulation_steps == 0:

                    optimizer.step()
                    scheduler.step()
                    optimizer.zero_grad()
                    global_step += 1

                if (step + 1) % (self.eval_steps *
                                 self.gradient_accumulation_steps) == 0:
                    tr_loss = 0
                    nb_tr_examples, nb_tr_steps = 0, 0
                    logger.info("***** Report result *****")
                    logger.info("  %s = %s", 'global_step', str(global_step))
                    logger.info("  %s = %s", 'train loss', str(train_loss))

                if self.do_eval and (
                        step + 1) % (self.eval_steps *
                                     self.gradient_accumulation_steps) == 0:
                    for file in ['dev.csv']:
                        inference_labels = []
                        gold_labels = []
                        inference_logits = []

                        logger.info("***** Running evaluation *****")
                        logger.info("  Num examples = %d", len(eval_examples))
                        logger.info("  Batch size = %d", self.eval_batch_size)

                        # Run prediction for full data

                        model.eval()
                        eval_loss, eval_accuracy = 0, 0
                        nb_eval_steps, nb_eval_examples = 0, 0
                        for input_ids, input_mask, segment_ids, label_ids in eval_dataloader:
                            input_ids = input_ids.to(self.device)
                            input_mask = input_mask.to(self.device)
                            segment_ids = segment_ids.to(self.device)
                            label_ids = label_ids.to(self.device)

                            with torch.no_grad():
                                tmp_eval_loss = model(
                                    input_ids=input_ids,
                                    token_type_ids=segment_ids,
                                    attention_mask=input_mask,
                                    labels=label_ids)
                                logits = model(input_ids=input_ids,
                                               token_type_ids=segment_ids,
                                               attention_mask=input_mask)

                            logits = logits.detach().cpu().numpy()
                            label_ids = label_ids.to('cpu').numpy()
                            inference_labels.append(np.argmax(logits, axis=1))
                            gold_labels.append(label_ids)
                            inference_logits.append(logits)
                            eval_loss += tmp_eval_loss.mean().item()
                            nb_eval_examples += input_ids.size(0)
                            nb_eval_steps += 1

                        gold_labels = np.concatenate(gold_labels, 0)
                        inference_logits = np.concatenate(inference_logits, 0)
                        model.train()
                        eval_loss = eval_loss / nb_eval_steps
                        eval_accuracy = accuracyBDCI(inference_logits,
                                                     gold_labels)

                        result = {
                            'eval_loss': eval_loss,
                            'eval_F1': eval_accuracy,
                            'global_step': global_step,
                            'loss': train_loss
                        }

                        output_eval_file = os.path.join(
                            self.output_dir, "eval_results.txt")
                        with open(output_eval_file, "a") as writer:
                            for key in sorted(result.keys()):
                                logger.info("  %s = %s", key, str(result[key]))
                                writer.write("%s = %s\n" %
                                             (key, str(result[key])))
                            writer.write('*' * 80)
                            writer.write('\n')
                        if eval_accuracy > best_acc and 'dev' in file:
                            print("=" * 80)
                            print("Best F1", eval_accuracy)
                            print("Saving Model......")
                            best_acc = eval_accuracy
                            # Save a trained model
                            model_to_save = model.module if hasattr(
                                model, 'module') else model
                            output_model_file = os.path.join(
                                self.output_dir,
                                "pytorch_model_{}.bin".format(split_index))
                            torch.save(model_to_save.state_dict(),
                                       output_model_file)
                            print("=" * 80)
                        else:
                            print("=" * 80)
        if self.do_test:
            del model
            gc.collect()
            self.do_train = False
            data = DATABDCI(debug=False,
                            data_dir='/home/lsy2018/文本匹配/DATA/DATA_BDCI/',
                            data_process_output=
                            '/home/lsy2018/文本匹配/DATA/DATA_BDCI/data_1014/')
            model = BertForSequenceClassification.from_pretrained(
                os.path.join(self.output_dir, "pytorch_model.bin"),
                self.args,
                config=config)
            model.to(self.device)

            for file, flag in [('dev.csv', 'dev'), ('test.csv', 'test')]:
                inference_labels = []
                gold_labels = []
                eval_examples = data.read_examples(os.path.join(
                    self.data_dir, file),
                                                   is_training=False)
                print('exa', len(eval_examples))
                eval_features = data.convert_examples_to_features(
                    eval_examples, self.tokenizer, self.max_seq_length)
                all_input_ids = torch.tensor(data.select_field(
                    eval_features, 'input_ids'),
                                             dtype=torch.long)
                all_input_mask = torch.tensor(data.select_field(
                    eval_features, 'input_mask'),
                                              dtype=torch.long)
                all_segment_ids = torch.tensor(data.select_field(
                    eval_features, 'segment_ids'),
                                               dtype=torch.long)
                all_label = torch.tensor([f.label for f in eval_features],
                                         dtype=torch.long)

                eval_data = TensorDataset(all_input_ids, all_input_mask,
                                          all_segment_ids, all_label)
                # Run prediction for full data
                eval_sampler = SequentialSampler(eval_data)
                eval_dataloader = DataLoader(eval_data,
                                             sampler=eval_sampler,
                                             batch_size=self.eval_batch_size)

                model.eval()
                eval_loss, eval_accuracy = 0, 0
                nb_eval_steps, nb_eval_examples = 0, 0
                for input_ids, input_mask, segment_ids, label_ids in eval_dataloader:
                    input_ids = input_ids.to(self.device)
                    input_mask = input_mask.to(self.device)
                    segment_ids = segment_ids.to(self.device)
                    label_ids = label_ids.to(self.device)

                    with torch.no_grad():
                        logits = model(
                            input_ids=input_ids,
                            token_type_ids=segment_ids,
                            attention_mask=input_mask).detach().cpu().numpy()
                    label_ids = label_ids.to('cpu').numpy()
                    inference_labels.append(logits)
                    gold_labels.append(label_ids)
                gold_labels = np.concatenate(gold_labels, 0)
                logits = np.concatenate(inference_labels, 0)
                if flag == 'dev':
                    print(flag, accuracyBDCI(logits, gold_labels))
                if flag == 'test':
                    df = pd.read_csv(os.path.join(self.data_dir, file),
                                     names=['id', 'content', 'title', 'label'])
                    predict = np.argmax(logits, axis=1).tolist()
                    print(df.shape[0])
                    print(len(predict))
                    df['labelpre'] = predict
                    df[['id', 'labelpre'
                        ]].to_csv(os.path.join(self.output_dir, "sub.csv"),
                                  index=False,
                                  header=False)
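The accumulation branch above steps the optimizer every gradient_accumulation_steps batches but backpropagates the unscaled loss, so the accumulated gradient is gradient_accumulation_steps times that of one large batch. A sketch of the conventional scaling, reusing the names from the loop above (an assumption about intent, not the original code):

                loss = model(input_ids=input_ids, token_type_ids=segment_ids,
                             attention_mask=input_mask, labels=label_ids)
                # Scale so the summed gradients match one batch of equivalent size.
                (loss / self.gradient_accumulation_steps).backward()
                if (nb_tr_steps + 1) % self.gradient_accumulation_steps == 0:
                    optimizer.step()
                    scheduler.step()
                    optimizer.zero_grad()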
Example #13
    def convert_examples_to_features(self, examples, tokenizer,
                                     max_seq_length):
        features = []
        '''
        Process each example.
        '''
        for example_index, example in enumerate(examples):

            # eachturn_tokens = tokenizer.tokenize(example.text_eachturn)
            eachturn_tokens = example.text_history.split(' ')
            eachturn_domainslot_token = self.domain_slot_token
            eachturn_label_tokens_start = example.label_tokens_start
            eachturn_label_tokens_end = example.label_tokens_end
            eachturn_label_sentence_domainslot = example.label_sentence_domainslot
            eachturn_label_tokens_domainslot = example.label_tokens_domainslot
            # for each in eachturn_label_tokens_domainslot:
            #     print(len(each),each)
            # exit()
            choices_features = []
            total_length = len(eachturn_tokens)
            max_seq_length_ = max_seq_length - 3 - len(self.domain_slot_token)
            if total_length > max_seq_length_:
                eachturn_tokens = eachturn_tokens[-max_seq_length_:]
                eachturn_label_tokens_start = eachturn_label_tokens_start[
                    -max_seq_length_:]
                eachturn_label_tokens_end = eachturn_label_tokens_end[
                    -max_seq_length_:]
                eachturn_label_tokens_domainslot = eachturn_label_tokens_domainslot[
                    -max_seq_length_:]
            tokens = ["[CLS]"] + eachturn_tokens + ["[SEP]"]
            eachturn_label_tokens_start = [0] + eachturn_label_tokens_start + [
                0
            ]
            eachturn_label_tokens_end = [0] + eachturn_label_tokens_end + [0]
            eachturn_label_tokens_domainslot.insert(
                0, [0] * len(eachturn_label_sentence_domainslot))
            eachturn_label_tokens_domainslot.append(
                [0] * len(eachturn_label_sentence_domainslot))
            # print(len(tokens))
            # exit()
            padding_length = max_seq_length - 1 - len(
                self.domain_slot_token) - len(tokens)

            eachturn_label_tokens_start += [0] * padding_length
            eachturn_label_tokens_end += [0] * padding_length
            for i in range(padding_length):
                eachturn_label_tokens_domainslot.append(
                    [0] * len(eachturn_label_sentence_domainslot))

            tokens_input_ids = tokenizer.convert_tokens_to_ids(tokens)
            domainslot_input_ids = tokenizer.convert_tokens_to_ids(
                eachturn_domainslot_token + ["[SEP]"])

            assert len(eachturn_label_tokens_domainslot) == len(
                eachturn_label_tokens_start) == len(eachturn_label_tokens_end)
            assert len(eachturn_label_tokens_domainslot[0]) == len(
                eachturn_label_sentence_domainslot)

            # input_ids = tokenizer.convert_tokens_to_ids(tokens+ eachturn_domainslot_token + ["[SEP]"])
            input_ids = tokens_input_ids + [
                0
            ] * padding_length + domainslot_input_ids
            segment_ids = [0] * len(tokens_input_ids) + [
                0
            ] * padding_length + [1] * (len(eachturn_domainslot_token) + 1)
            input_mask = [1] * len(tokens_input_ids) + [0] * padding_length + [
                0
            ] * (len(eachturn_domainslot_token) + 1)
            utterance_mask = [1] * (len(tokens_input_ids) + padding_length) + [
                0
            ] * (len(eachturn_domainslot_token) + 1)
            domainslot_mask = [0] * (len(tokens_input_ids) +
                                     padding_length) + [1] * (
                                         len(eachturn_domainslot_token)) + [0]

            assert len(input_ids) == len(input_mask) == len(
                segment_ids) == max_seq_length
            assert len(input_ids) == len(utterance_mask) == len(
                domainslot_mask)

            # print(len(input_ids))

            choices_features.append(
                (tokens, input_ids, input_mask, segment_ids, utterance_mask,
                 domainslot_mask))
            if example_index < 3:
                logger.info("*** Example ***")
                logger.info("idx: {}".format(example_index))
                logger.info("guid: {}".format(example.guid))
                logger.info("tokens: {}".format(' '.join(tokens).replace(
                    '\u2581', '_')))
                logger.info("utterance_mask: {}".format(utterance_mask))
                logger.info("domainslot_mask: {}".format(domainslot_mask))
                logger.info(('turn_belief:{}'.format(example.turn_belief)))
                logger.info("eachturn_label_tokens_start: {}".format(
                    eachturn_label_tokens_start))
                logger.info("eachturn_label_tokens_end: {}".format(
                    eachturn_label_tokens_end))
                # logger.info("eachturn_label_tokens_domainslot: {}".format(eachturn_label_tokens_domainslot))
                logger.info("eachturn_label_sentence_domainslot:{}".format(
                    eachturn_label_sentence_domainslot))
            features.append(
                InputFeatures(
                    example_id=example.guid,
                    hist_token=tokens + eachturn_domainslot_token + ["[SEP]"],
                    choices_features=choices_features,
                    label_tokens_start=eachturn_label_tokens_start,
                    label_tokens_end=eachturn_label_tokens_end,
                    label_tokens_domainslot=eachturn_label_tokens_domainslot,
                    label_sentence_domainslot=
                    eachturn_label_sentence_domainslot,
                ))
        return features
Example #14
    def train(self):
        trainset, train_dataloader, testset, test_dataloader = self.create_dataloader(
        )
        model = Classifier(1, 256, trainset.num_classes).to(self.device)
        loss_func = nn.CrossEntropyLoss()
        optimizer = optim.Adam(model.parameters(), lr=0.001)
        model.train()
        logger.info("***** Running training *****")
        logger.info("  Num examples = %d", len(trainset.ids))
        logger.info("  Batch size = %d", self.train_batchsize)
        logger.info("  Num steps = %d", self.train_steps)

        global_step, nb_tr_steps, tr_loss = 0, 0, 0
        best_MRR = 0
        train_dataloader = cycle(train_dataloader)

        for each_step in range(self.train_steps):
            bg, label = next(train_dataloader)
            prediction = model(bg)
            loss = loss_func(prediction, label)
            tr_loss += loss.item()
            train_loss = round(tr_loss / (nb_tr_steps + 1), 4)
            nb_tr_steps += 1

            optimizer.zero_grad()
            loss.backward()
            optimizer.step()
            global_step += 1

            if (each_step + 1) % (self.eval_steps) == 0:
                tr_loss = 0
                nb_tr_steps = 0
                logger.info("***** Report result *****")
                logger.info("  %s = %s", 'global_step', str(global_step))
                logger.info("  %s = %s", 'train loss', str(train_loss))
                logger.info("***** Running evaluation *****")
                logger.info("  Num examples = %d", len(testset.ids))
                logger.info("  Batch size = %d", self.eval_batchsize)

                scores = []
                labels = []
                ids = testset.ids
                model.eval()

                for _, (bg, label) in enumerate(test_dataloader):
                    with torch.no_grad():
                        logits = model(bg).detach().cpu().numpy()
                    label = label.detach().cpu().numpy()
                    scores.append(logits)
                    labels.append(label)
                scores = np.concatenate(scores, 0)
                labels = np.concatenate(labels, 0)
                model.train()

                assert len(ids) == len(scores) == len(labels)
                eval_DOUBAN_MRR, eval_DOUBAN_mrr, eval_DOUBAN_MAP, eval_Precision1 = compute_DOUBAN(
                    ids, scores, labels)
                print('eval_MRR', eval_DOUBAN_MRR, eval_DOUBAN_mrr, 'eval_MAP',
                      eval_DOUBAN_MAP, 'eval_Precision1', eval_Precision1)
                result = {
                    'eval_MRR': eval_DOUBAN_MRR,
                    'eval_MAP': eval_DOUBAN_MAP,
                    'eval_Precision1': eval_Precision1,
                    'global_step': global_step,
                    'loss': train_loss
                }
                output_eval_file = os.path.join(self.output_dir,
                                                "eval_results.txt")
                with open(output_eval_file, "a") as writer:
                    for key in sorted(result.keys()):
                        logger.info("  %s = %s", key, str(result[key]))
                        writer.write("%s = %s\n" % (key, str(result[key])))
                    writer.write('*' * 80)
                    writer.write('\n')
                if eval_DOUBAN_MRR > best_MRR:
                    print("=" * 80)
                    print("Best MRR", eval_DOUBAN_MRR)
                    print("Saving Model......")
                    best_MRR = eval_DOUBAN_MRR
                    model_to_save = model.module if hasattr(
                        model, 'module') else model
                    output_model_file = os.path.join(self.output_dir,
                                                     "pytorch_model.bin")
                    torch.save(model_to_save.state_dict(), output_model_file)
                    print("=" * 80)
                else:
                    print("=" * 80)
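Several of the training loops here (Examples #12, #14, #16, #17) wrap the DataLoader in cycle() and draw a fixed number of steps with next(). If that is itertools.cycle, it caches the first pass's batches and replays them, so per-epoch reshuffling is lost; a common reshuffling alternative is a small generator like this sketch:

def cycle(dataloader):
    # Unlike itertools.cycle, re-iterating the DataLoader each epoch lets
    # a sampler with shuffle=True yield a fresh batch order on every pass.
    while True:
        for batch in dataloader:
            yield batch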
Example #15
    def convert_examples_to_features(self, examples, tokenizer, max_seq_length, max_history_length=192):
        features = []
        '''
        Process each example.
        '''
        for example_index, example in enumerate(examples):
            try:
                history_tokens = tokenizer.tokenize(example.text_his)            
                utterance_tokens = tokenizer.tokenize(example.text_utt)     
                response_tokens = tokenizer.tokenize(example.text_resp)
            except Exception:
                print(example.text_his, example.text_utt, example.text_resp, '\n')
                continue  # skip examples whose text cannot be tokenized
            choices_features = []
            segment_maxlen = (max_seq_length-4)//2
            self._truncate_seq_pair(
                utterance_tokens,
                response_tokens,
                history_tokens,
                segment_maxlen,
                max_history_length)

            history_inputids = tokenizer.convert_tokens_to_ids(["[CLS]"] + history_tokens + ["[SEP]"])
            utterance_inputids = tokenizer.convert_tokens_to_ids(["[CLS]"] + utterance_tokens + ["[SEP]"])
            response_inputids = tokenizer.convert_tokens_to_ids(["[CLS]"] + response_tokens + ["[SEP]"])

            history_padding = max_history_length-2 - len(history_tokens)
            utterance_padding = segment_maxlen - len(utterance_tokens)
            response_padding = segment_maxlen - len(response_tokens)

            input_ids = utterance_inputids+[0]*utterance_padding+response_inputids+[0]*response_padding+history_inputids+[0]*history_padding
            # print(
            #     len(input_ids),
            #     len(utterance_inputids),
            #     utterance_padding,
            #     len(response_inputids),
            #     response_padding,
            #     len(history_inputids),
            #     history_padding)
            segment_ids = [1]*(segment_maxlen+2)+[0]*(segment_maxlen+2)+[1]*(max_history_length)
            input_mask = [1]*(len(utterance_inputids))+[0]*utterance_padding+[1]*len(response_inputids)+[0]*response_padding+[1]*len(history_inputids)+[0]*history_padding

            history_mask = [0]*(segment_maxlen+2+segment_maxlen+2)+[1]*max_history_length
            utterance_mask = [1]*(segment_maxlen+2)+[0]*(segment_maxlen+2+max_history_length)
            response_mask = [0]*(segment_maxlen+2)+[1]*(segment_maxlen+2)+[0]*max_history_length

            # padding_length = max_seq_length - len(input_ids)
            # input_ids += ([0] * padding_length)
            # input_mask += ([0] * padding_length)
            # segment_ids += ([0] * padding_length)
            # utterance_mask += ([0] * padding_length)
            # response_mask += ([0] * padding_length)
            # history_mask += ([0]*padding_length)

            # print(len(input_ids),len(segment_ids),len(input_mask),len(utterance_mask),len(response_mask),len(history_mask))
            graph_tokens=["[CLS]"]+utterance_tokens+["[SEP]"]+["[PAD]"]*utterance_padding+\
                         ["[CLS]"]+response_tokens+["[SEP]"]+["[PAD]"]*response_padding+\
                         ["[CLS]"]+history_tokens+["[SEP]"]+["[PAD]"]*history_padding
            assert len(graph_tokens) == len(input_ids)
            # print(graph_tokens)

            row = []
            col = []
            weight = []

            assert len(input_ids) ==len(segment_ids) ==len(input_mask) ==len(utterance_mask) == len(response_mask) == len(history_mask)

            choices_features.append((utterance_tokens+response_tokens, input_ids, input_mask, segment_ids,utterance_mask,response_mask,history_mask))

            label = example.label

            if example_index < 3:
                logger.info("*** Example ***")
                logger.info("idx: {}".format(example_index))
                logger.info("guid: {}".format(example.guid))
                logger.info("tokens: {}".format(' '.join(utterance_tokens+response_tokens).replace('\u2581', '_')))
                logger.info("label: {}".format(label))
            features.append(
                InputFeatures(
                    example_id=example.guid,
                    choices_features=choices_features,
                    label=label
                )
            )
        # exit()
        return features
Example #16
    def train(self):
        if not os.path.exists(self.output_dir):
            os.makedirs(self.output_dir)


        # logger.info(f'Fold {split_index + 1}')
        train_dataloader, eval_dataloader, train_examples, eval_examples = self.create_dataloader()

        num_train_optimization_steps = self.train_steps

        # Prepare model
        config = BertConfig.from_pretrained(self.model_name_or_path)
        model = BertForTokenClassification.from_pretrained(self.model_name_or_path,self.args, config=config)
        model.to(self.device)
        model.train()
        # Prepare optimizer
        param_optimizer = list(model.named_parameters())
        param_optimizer = [n for n in param_optimizer]

        no_decay = ['bias', 'LayerNorm.bias', 'LayerNorm.weight']
        optimizer_grouped_parameters = [
            {'params': [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)],
             'weight_decay': self.weight_decay},
            {'params': [p for n, p in param_optimizer if any(nd in n for nd in no_decay)], 'weight_decay': 0.0}
        ]

        optimizer = AdamW(optimizer_grouped_parameters, lr=self.learning_rate, eps=self.adam_epsilon)
        scheduler = WarmupLinearSchedule(optimizer, warmup_steps=self.warmup_steps, t_total=self.train_steps)

        global_step = 0

        logger.info("***** Running training *****")
        logger.info("  Num examples = %d", len(train_examples))
        logger.info("  Batch size = %d", self.train_batch_size)
        logger.info("  Num steps = %d", num_train_optimization_steps)

        best_acc = 0
        best_MRR = 0
        tr_loss = 0
        nb_tr_examples, nb_tr_steps = 0, 0
        train_dataloader = cycle(train_dataloader)

        for step in range(num_train_optimization_steps):
            batch = next(train_dataloader)
            batch = tuple(t.to(self.device) for t in batch)
            input_ids,input_mask,segment_ids,\
            utterance_mask,domain_mask, \
            slot_mask,hist_mask,\
            label_value_start,label_value_end,\
            label_domainslot = batch

            loss_tokenstart,loss_tokenend,loss_domainslot = model(
                input_ids=input_ids,
                token_type_ids=segment_ids,
                attention_mask=input_mask,
                utterance_mask = utterance_mask,
                domain_mask = domain_mask,
                slot_mask = slot_mask,
                hist_mask = hist_mask,
                label_value_start=label_value_start,
                label_value_end = label_value_end,
                label_domainslot = label_domainslot
            )
            loss = loss_tokenstart + loss_tokenend + loss_domainslot
            # loss = loss_domainslot
            tr_loss += loss.item()
            train_loss = round(tr_loss / (nb_tr_steps + 1), 4)

            nb_tr_examples += input_ids.size(0)
            nb_tr_steps += 1

            loss.backward()
            if (nb_tr_steps + 1) % self.gradient_accumulation_steps == 0:

                optimizer.step()
                optimizer.zero_grad()
                scheduler.step()
                global_step += 1

            if (step + 1) % (self.eval_steps * self.gradient_accumulation_steps) == 0:
                tr_loss = 0
                nb_tr_examples, nb_tr_steps = 0, 0
                logger.info("***** Report result *****")
                logger.info("  %s = %s", 'global_step', str(global_step))
                logger.info("  %s = %s", 'train loss', str(train_loss))

            if self.do_eval and (step + 1) % (self.eval_steps * self.gradient_accumulation_steps) == 0:
                for file in ['dev.csv']:
                    gold_value_start = []
                    gold_value_end = []
                    gold_domainslot = []
                    scores_value_start = []
                    scores_value_end = []
                    scores_domainslot = []
                    dialogueID = [x.guid for x in eval_examples]
                    utterance_text = [x.text_eachturn for x in eval_examples]
                    logger.info("***** Running evaluation *****")
                    logger.info("  Num examples = %d", len(eval_examples))
                    logger.info("  Batch size = %d", self.eval_batch_size)

                    model.eval()
                    eval_loss_tokens_start,eval_loss_tokens_end,eval_loss_domainslot = 0,0,0
                    eval_F1_tokens_start,eval_F1_tokens_end = 0,0
                    eval_F1_sentence_domainslot,eval_F1_tokens_domainslot = 0,0
                    nb_eval_steps, nb_eval_examples = 0, 0
                    for input_ids,input_mask, segment_ids,\
                        utterance_mask,domain_mask, \
                        slot_mask,hist_mask,\
                        label_value_start,label_value_end,\
                        label_domainslot in eval_dataloader:
                        input_ids = input_ids.to(self.device)
                        input_mask = input_mask.to(self.device)
                        segment_ids = segment_ids.to(self.device)
                        utterance_mask = utterance_mask.to(self.device)
                        domain_mask = domain_mask.to(self.device)
                        slot_mask = slot_mask.to(self.device)
                        hist_mask = hist_mask.to(self.device)
                        label_value_start = label_value_start.to(self.device)
                        label_value_end = label_value_end.to(self.device)
                        label_domainslot = label_domainslot.to(self.device)


                        with torch.no_grad():
                            batch_eval_loss_value_start,batch_eval_loss_value_end,batch_eval_loss_domainslot = model(
                                input_ids=input_ids,
                                token_type_ids=segment_ids,
                                attention_mask=input_mask,
                                utterance_mask = utterance_mask,
                                domain_mask = domain_mask,
                                slot_mask = slot_mask,
                                hist_mask = hist_mask,
                                label_value_start = label_value_start,
                                label_value_end=label_value_end,
                                label_domainslot=label_domainslot
                            )
                            logits_value_start,logits_value_end,logits_domainslot = model(
                                input_ids=input_ids,
                                token_type_ids=segment_ids,
                                attention_mask=input_mask,
                                utterance_mask = utterance_mask,
                                domain_mask = domain_mask,
                                slot_mask = slot_mask,
                                hist_mask = hist_mask,
                            )
                        logits_value_start = logits_value_start.cpu().numpy()
                        logits_value_end = logits_value_end.cpu().numpy()
                        logits_domainslot = logits_domainslot.cpu().numpy()

                        label_value_start = label_value_start.to('cpu').numpy()
                        label_value_end = label_value_end.to('cpu').numpy()
                        label_domainslot = label_domainslot.to('cpu').numpy()

                        scores_value_start.append(logits_value_start)
                        scores_value_end.append(logits_value_end)
                        scores_domainslot.append(logits_domainslot)

                        gold_value_start.append(label_value_start)
                        gold_value_end.append(label_value_end)
                        gold_domainslot.append(label_domainslot)

                        eval_loss_tokens_start += batch_eval_loss_value_start.mean().item()
                        eval_loss_tokens_end += batch_eval_loss_value_end.mean().item()
                        eval_loss_domainslot += batch_eval_loss_domainslot.mean().item()

                        nb_eval_examples += input_ids.size(0)
                        nb_eval_steps += 1

                    gold_value_start = np.concatenate(gold_value_start,0)
                    gold_value_end = np.concatenate(gold_value_end,0)
                    gold_domainslot = np.concatenate(gold_domainslot,0)

                    scores_value_start = np.concatenate(scores_value_start, 0)
                    scores_value_end = np.concatenate(scores_value_end, 0)
                    scores_domainslot = np.concatenate(scores_domainslot,0)

                    model.train()
                    eval_loss_tokens_start = eval_loss_tokens_start/nb_eval_steps
                    eval_loss_tokens_end = eval_loss_tokens_end / nb_eval_steps
                    eval_loss_domainslot = eval_loss_domainslot /nb_eval_steps

                    # print(scores_domainslot.shape)
                    # print(gold_labels_domainslot.shape)
                    # print(scores_domainslot)
                    # print(gold_labels_domainslot)
                    # exit()
                    # eval_accuracy_token_start = accuracyF1(scores_domain, gold_labels_domain,mode='domain')
                    # eval_accuracy_token_end = accuracyF1(scores_dependcy, gold_labels_dependcy ,mode= 'dependcy')

                    eval_F1_valuestart,eval_F1_valueend,F1_domainslot = compute_jointGoal_domainslot(
                        dialogueID,
                        utterance_text,
                        scores_value_start,
                        scores_value_end,
                        scores_domainslot,
                        gold_value_start,
                        gold_value_end,
                        gold_domainslot
                    )


                    print(
                        'F1_domainslot',F1_domainslot,
                        'eval_F1_valuestart',eval_F1_valuestart,
                        'eval_F1_valueend', eval_F1_valueend,
                        'global_step',global_step,
                        'loss',train_loss
                    )
                    result = {

                        'eval_loss_tokens_start':eval_loss_tokens_start,
                        'eval_loss_tokens_end': eval_loss_tokens_end,
                        'eval_loss_domainslot':eval_loss_domainslot,

                        'F1_domainslot': F1_domainslot,
                        'eval_F1_valuestart': eval_F1_valuestart,
                        'eval_F1_valueend': eval_F1_valueend,
                        'global_step': global_step,
                        'loss': train_loss}

                    output_eval_file = os.path.join(self.output_dir, "eval_results.txt")
                    with open(output_eval_file, "a") as writer:
                        for key in sorted(result.keys()):
                            logger.info("  %s = %s", key, str(result[key]))
                            writer.write("%s = %s\n" % (key, str(result[key])))
                        writer.write('*' * 80)
                        writer.write('\n')
                    if eval_F1_valuestart > best_acc :
                        print("=" * 80)
                        print("Best jointGoal", eval_F1_valuestart)
                        print("Saving Model......")
                        # best_acc = eval_accuracy
                        best_acc = eval_F1_valuestart
                        # Save a trained model
                        model_to_save = model.module if hasattr(model,'module') else model
                        output_model_file = os.path.join(self.output_dir, "pytorch_model.bin")
                        torch.save(model_to_save.state_dict(), output_model_file)
                        print("=" * 80)
                    else:
                        print("=" * 80)
Example #17
    def train(self):
        if not os.path.exists(self.output_dir):
            os.makedirs(self.output_dir)

        data_splitList = DATACQA.load_data(os.path.join(self.data_dir, 'train.csv'),n_splits=5)
        for split_index,each_data in enumerate(data_splitList):
            # Prepare model
            config = BertConfig.from_pretrained(self.model_name_or_path, num_labels=self.num_labels)
            model = BertForSequenceClassification.from_pretrained(self.model_name_or_path, self.args, config=config)
            model.to(self.device)

            logger.info(f'Fold {split_index + 1}')
            train_dataloader, eval_dataloader, train_examples, eval_examples = self.create_dataloader(each_data)

            num_train_optimization_steps = self.train_steps

            # Prepare optimizer

            param_optimizer = list(model.named_parameters())
            param_optimizer = [n for n in param_optimizer]

            no_decay = ['bias', 'LayerNorm.bias', 'LayerNorm.weight']
            optimizer_grouped_parameters = [
                {'params': [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)],
                 'weight_decay': self.weight_decay},
                {'params': [p for n, p in param_optimizer if any(nd in n for nd in no_decay)], 'weight_decay': 0.0}
            ]

            optimizer = AdamW(optimizer_grouped_parameters, lr=self.learning_rate, eps=self.adam_epsilon)
            scheduler = WarmupLinearSchedule(optimizer, warmup_steps=self.warmup_steps, t_total=self.train_steps)

            global_step = 0

            logger.info("***** Running training *****")
            logger.info("  Num examples = %d", len(train_examples))
            logger.info("  Batch size = %d", self.train_batch_size)
            logger.info("  Num steps = %d", num_train_optimization_steps)

            best_acc = 0
            model.train()
            tr_loss = 0
            nb_tr_examples, nb_tr_steps = 0, 0
            train_dataloader = cycle(train_dataloader)

            for step in range(num_train_optimization_steps):
                batch = next(train_dataloader)
                batch = tuple(t.to(self.device) for t in batch)
                input_ids, input_mask, segment_ids, label_ids = batch
                loss = model(input_ids=input_ids, token_type_ids=segment_ids, attention_mask=input_mask, labels=label_ids)
                tr_loss += loss.item()
                train_loss = round(tr_loss / (nb_tr_steps + 1), 4)

                nb_tr_examples += input_ids.size(0)
                nb_tr_steps += 1

                loss.backward()

                if (nb_tr_steps + 1) % self.gradient_accumulation_steps == 0:

                    optimizer.step()
                    scheduler.step()
                    optimizer.zero_grad()
                    global_step += 1

                if (step + 1) % (self.eval_steps * self.gradient_accumulation_steps) == 0:
                    tr_loss = 0
                    nb_tr_examples, nb_tr_steps = 0, 0
                    logger.info("***** Report result *****")
                    logger.info("  %s = %s", 'global_step', str(global_step))
                    logger.info("  %s = %s", 'train loss', str(train_loss))

                if self.do_eval and (step + 1) % (self.eval_steps * self.gradient_accumulation_steps) == 0:
                    for file in ['dev.csv']:
                        inference_labels = []
                        gold_labels = []
                        inference_logits = []
                        scores = []
                        questions = [x.text_a for x in eval_examples]

                        logger.info("***** Running evaluation *****")
                        logger.info("  Num examples = %d", len(eval_examples))
                        logger.info("  Batch size = %d", self.eval_batch_size)

                        # Run prediction for full data

                        model.eval()
                        eval_loss, eval_accuracy = 0, 0
                        nb_eval_steps, nb_eval_examples = 0, 0
                        for input_ids, input_mask, segment_ids, label_ids in eval_dataloader:
                            input_ids = input_ids.to(self.device)
                            input_mask = input_mask.to(self.device)
                            segment_ids = segment_ids.to(self.device)
                            label_ids = label_ids.to(self.device)

                            with torch.no_grad():
                                tmp_eval_loss = model(
                                    input_ids=input_ids,
                                    token_type_ids=segment_ids,
                                    attention_mask=input_mask,
                                    labels=label_ids)
                                logits = model(input_ids=input_ids, token_type_ids=segment_ids, attention_mask=input_mask)

                            logits = logits.detach().cpu().numpy()
                            label_ids = label_ids.to('cpu').numpy()
                            inference_labels.append(np.argmax(logits, axis=1))
                            scores.append(logits)
                            gold_labels.append(label_ids)
                            inference_logits.append(logits)
                            eval_loss += tmp_eval_loss.mean().item()
                            nb_eval_examples += input_ids.size(0)
                            nb_eval_steps += 1

                        gold_labels = np.concatenate(gold_labels, 0)
                        inference_logits = np.concatenate(inference_logits, 0)
                        scores = np.concatenate(scores, 0)
                        model.train()
                        eval_loss = eval_loss / nb_eval_steps
                        eval_accuracy = accuracyCQA(inference_logits, gold_labels)
                        eval_mrr = compute_MRR_CQA(scores,gold_labels,questions)
                        eval_5R20 = compute_5R20(scores,gold_labels,questions)

                        result = {'eval_loss': eval_loss,
                                  'eval_F1': eval_accuracy,
                                  'eval_MRR':eval_mrr,
                                  'eval_5R20':eval_5R20,
                                  'global_step': global_step,
                                  'loss': train_loss}

                        output_eval_file = os.path.join(self.output_dir, "eval_results.txt")
                        with open(output_eval_file, "a") as writer:
                            for key in sorted(result.keys()):
                                logger.info("  %s = %s", key, str(result[key]))
                                writer.write("%s = %s\n" % (key, str(result[key])))
                            writer.write('*' * 80)
                            writer.write('\n')
                        if eval_accuracy > best_acc :
                            print("=" * 80)
                            print("Best F1", eval_accuracy)
                            print("Saving Model......")
                            best_acc = eval_accuracy
                            # Save a trained model
                            model_to_save = model.module if hasattr(model,'module') else model
                            output_model_file = os.path.join(self.output_dir, "pytorch_model_{}.bin".format(split_index))
                            torch.save(model_to_save.state_dict(), output_model_file)
                            print("=" * 80)
                        else:
                            print("=" * 80)

            del model
            gc.collect()
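The metric helpers called above (accuracyCQA, compute_MRR_CQA, compute_5R20) are not shown in these snippets. As a rough, non-authoritative sketch, a per-question MRR along the lines of compute_MRR_CQA might look like the following; the signature, the use of column 1 as the positive-class score, and the grouping by question id are all assumptions.

import numpy as np

def compute_mrr_sketch(scores, gold_labels, questions):
    # group candidate scores by question id (assumed grouping key)
    by_question = {}
    for score, label, qid in zip(scores, gold_labels, questions):
        # score[1] is assumed to be the positive-class logit
        by_question.setdefault(qid, []).append((float(score[1]), int(label)))
    reciprocal_ranks = []
    for candidates in by_question.values():
        # rank candidates by score, best first
        ranked = sorted(candidates, key=lambda pair: pair[0], reverse=True)
        for rank, (_, label) in enumerate(ranked, start=1):
            if label == 1:
                reciprocal_ranks.append(1.0 / rank)
                break
        else:
            # no relevant candidate for this question
            reciprocal_ranks.append(0.0)
    return float(np.mean(reciprocal_ranks))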
    def setUp(self):
        logger.info(' {0} >> {1} '.format(self.__class__.__name__,
                                          self._testMethodName).center(100, '*'))
        self.homePage.browser.navigate_to('https://www.utest.com/')
        self.homePage.sign_in_button().click()
    def launch_local_browser(self, local_browser_name=BROWSER_NAME, window_size=BROWSER_WINDOW_SIZE,
                             implicity_wait_timeout=IMPLICITY_WAIT_TIME):
        """Launch a local browser."""
        try:
            # Initialize the browser (compare the parameter, not the module-level constant)
            if local_browser_name.lower() == "chrome":
                logger.info("Launch {0} browser".format(local_browser_name))
                self.driver = webdriver.Chrome(CHROME_DRIVER_PATH)
            elif local_browser_name.lower() == "firefox":
                logger.info("Launch {0} browser".format(local_browser_name))
                self.driver = webdriver.Firefox()
            elif local_browser_name.lower() == "ie":
                logger.info("Launch {0} browser".format(local_browser_name))
                self.driver = webdriver.Ie(IE_DRIVER_PATH)
            else:
                raise NameError(local_browser_name)
            # Set the browser window size
            if window_size.lower() == "max":
                logger.info("Maximize browser")
                self.driver.maximize_window()
            elif window_size.lower() == "min":
                logger.info("Minimize browser")
                self.driver.minimize_window()
            # Set the implicit wait time
            logger.info("Set implicit wait time to {0}".format(implicity_wait_timeout))
            self.driver.implicitly_wait(implicity_wait_timeout)
            return self.driver
        except NameError:
            logger.error("Fail to launch browser due to incorrect browser name: {0}".format(local_browser_name))
            raise
        except WebDriverException as e:
            logger.error("Fail to launch browser: {0}".format(e))
            raise
        except Exception:
            logger.exception("Fail to launch browser")
            raise

    def quit_browser(self):
        logger.info("Quit browser and release the current driver")
        self.driver.quit()
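A minimal sketch of how launch_local_browser and quit_browser might be wired into the unittest lifecycle; BrowserEngine and its module path are hypothetical names, not taken from the snippets above.

import unittest

from Engine.BrowserEngine import BrowserEngine  # hypothetical module path

class BaseTestCase(unittest.TestCase):
    def setUp(self):
        # launch a fresh browser before each test
        self.engine = BrowserEngine()
        self.driver = self.engine.launch_local_browser()

    def tearDown(self):
        # always release the driver, even on test failure
        self.engine.quit_browser()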
Example #21
0
    def train(self):
        # create the output directory if it does not already exist
        os.makedirs(args.output_dir, exist_ok=True)

        tokenizer = BertTokenizer.from_pretrained(self.model_name_or_path, do_lower_case=self.do_lower_case)
        config = BertConfig.from_pretrained(self.model_name_or_path, num_labels=3)

        # Prepare model
        model = BertForSequenceClassification.from_pretrained(self.model_name_or_path, args, config=config)
        model.to(self.device)

        train_batch_size = self.per_gpu_train_batch_size
        eval_batch_size = self.per_gpu_eval_batch_size
        for i in range(1):  # runs exactly once; the loop is kept only for structure

            # Prepare data loader

            train_examples = self.read_examples(os.path.join(self.data_dir, 'train.csv'), is_training=True)
            train_features = self.convert_examples_to_features(
                train_examples, tokenizer, self.max_seq_length)
            all_input_ids = torch.tensor(self.select_field(train_features, 'input_ids'), dtype=torch.long)
            all_input_mask = torch.tensor(self.select_field(train_features, 'input_mask'), dtype=torch.long)
            all_segment_ids = torch.tensor(self.select_field(train_features, 'segment_ids'), dtype=torch.long)
            all_label = torch.tensor([f.label for f in train_features], dtype=torch.long)
            train_data = TensorDataset(all_input_ids, all_input_mask, all_segment_ids, all_label)

            train_sampler = RandomSampler(train_data)
            train_dataloader = DataLoader(train_data, sampler=train_sampler, batch_size=train_batch_size)

            num_train_optimization_steps = self.train_steps

            # Prepare optimizer

            param_optimizer = list(model.named_parameters())

            # NOTE: this copy does not actually remove the pooler parameters,
            # even though the pooler is unused and its None grads can break apex
            param_optimizer = [n for n in param_optimizer]

            no_decay = ['bias', 'LayerNorm.bias', 'LayerNorm.weight']
            optimizer_grouped_parameters = [
                {'params': [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)],
                 'weight_decay': args.weight_decay},
                {'params': [p for n, p in param_optimizer if any(nd in n for nd in no_decay)], 'weight_decay': 0.0}
            ]

            optimizer = AdamW(optimizer_grouped_parameters, lr=self.learning_rate, eps=self.adam_epsilon)
            scheduler = WarmupLinearSchedule(optimizer, warmup_steps=self.warmup_steps, t_total=self.train_steps)

            global_step = 0

            logger.info("***** Running training *****")
            logger.info("  Num examples = %d", len(train_examples))
            logger.info("  Batch size = %d", train_batch_size)
            logger.info("  Num steps = %d", num_train_optimization_steps)

            best_acc = 0
            model.train()
            tr_loss = 0
            nb_tr_examples, nb_tr_steps = 0, 0
            train_dataloader = cycle(train_dataloader)

            for step in range(num_train_optimization_steps):
                batch = next(train_dataloader)
                batch = tuple(t.to(self.device) for t in batch)
                input_ids, input_mask, segment_ids, label_ids = batch
                loss = model(input_ids=input_ids, token_type_ids=segment_ids, attention_mask=input_mask, labels=label_ids)
                tr_loss += loss.item()
                train_loss = round(tr_loss / (nb_tr_steps + 1), 4)

                nb_tr_examples += input_ids.size(0)
                nb_tr_steps += 1

                loss.backward()

                if (nb_tr_steps + 1) % self.gradient_accumulation_steps == 0:

                    # step the optimizer before the LR scheduler (PyTorch >= 1.1 ordering)
                    optimizer.step()
                    scheduler.step()
                    optimizer.zero_grad()
                    global_step += 1

                if (step + 1) % (self.eval_steps * self.gradient_accumulation_steps) == 0:
                    tr_loss = 0
                    nb_tr_examples, nb_tr_steps = 0, 0
                    logger.info("***** Report result *****")
                    logger.info("  %s = %s", 'global_step', str(global_step))
                    logger.info("  %s = %s", 'train loss', str(train_loss))

                if self.do_eval and (step + 1) % (self.eval_steps * self.gradient_accumulation_steps) == 0:
                    for file in ['dev.csv']:
                        inference_labels = []
                        gold_labels = []
                        inference_logits = []
                        eval_examples = self.read_examples(os.path.join(self.data_dir, file), is_training=True)
                        eval_features = self.convert_examples_to_features(eval_examples, tokenizer, self.max_seq_length)
                        all_input_ids = torch.tensor(self.select_field(eval_features, 'input_ids'), dtype=torch.long)
                        all_input_mask = torch.tensor(self.select_field(eval_features, 'input_mask'), dtype=torch.long)
                        all_segment_ids = torch.tensor(self.select_field(eval_features, 'segment_ids'), dtype=torch.long)
                        all_label = torch.tensor([f.label for f in eval_features], dtype=torch.long)

                        eval_data = TensorDataset(all_input_ids, all_input_mask, all_segment_ids, all_label)

                        logger.info("***** Running evaluation *****")
                        logger.info("  Num examples = %d", len(eval_examples))
                        logger.info("  Batch size = %d", eval_batch_size)

                        # Run prediction for full data
                        eval_sampler = SequentialSampler(eval_data)
                        eval_dataloader = DataLoader(eval_data, sampler=eval_sampler, batch_size=eval_batch_size)

                        model.eval()
                        eval_loss, eval_accuracy = 0, 0
                        nb_eval_steps, nb_eval_examples = 0, 0
                        for input_ids, input_mask, segment_ids, label_ids in eval_dataloader:
                            input_ids = input_ids.to(self.device)
                            input_mask = input_mask.to(self.device)
                            segment_ids = segment_ids.to(self.device)
                            label_ids = label_ids.to(self.device)

                            with torch.no_grad():
                                tmp_eval_loss = model(input_ids=input_ids, token_type_ids=segment_ids,
                                                      attention_mask=input_mask, labels=label_ids)
                                logits = model(input_ids=input_ids, token_type_ids=segment_ids, attention_mask=input_mask)

                            logits = logits.detach().cpu().numpy()
                            label_ids = label_ids.to('cpu').numpy()
                            inference_labels.append(np.argmax(logits, axis=1))
                            gold_labels.append(label_ids)
                            inference_logits.append(logits)
                            eval_loss += tmp_eval_loss.mean().item()
                            nb_eval_examples += input_ids.size(0)
                            nb_eval_steps += 1

                        gold_labels = np.concatenate(gold_labels, 0)
                        inference_logits = np.concatenate(inference_logits, 0)
                        model.train()
                        eval_loss = eval_loss / nb_eval_steps
                        eval_accuracy = self.accuracy(inference_logits, gold_labels)

                        result = {'eval_loss': eval_loss,
                                  'eval_F1': eval_accuracy,
                                  'global_step': global_step,
                                  'loss': train_loss}

                        output_eval_file = os.path.join(self.output_dir, "eval_results.txt")
                        with open(output_eval_file, "a") as writer:
                            for key in sorted(result.keys()):
                                logger.info("  %s = %s", key, str(result[key]))
                                writer.write("%s = %s\n" % (key, str(result[key])))
                            writer.write('*' * 80)
                            writer.write('\n')
                        if eval_accuracy > best_acc and 'dev' in file:
                            print("=" * 80)
                            print("Best F1", eval_accuracy)
                            print("Saving Model......")
                            best_acc = eval_accuracy
                            # Save a trained model
                            model_to_save = model.module if hasattr(model, 'module') else model  # Only save the model itself
                            output_model_file = os.path.join(args.output_dir, "pytorch_model.bin")
                            torch.save(model_to_save.state_dict(), output_model_file)
                            print("=" * 80)
                        else:
                            print("=" * 80)
        if args.do_test:
            del model
            gc.collect()
            args.do_train = False
            model = BertForSequenceClassification.from_pretrained(os.path.join(args.output_dir, "pytorch_model.bin"), args,
                                                                  config=config)
            if args.fp16:
                model.half()
            model.to(self.device)
            if args.local_rank != -1:
                try:
                    from apex.parallel import DistributedDataParallel as DDP
                except ImportError:
                    raise ImportError(
                        "Please install apex from https://www.github.com/nvidia/apex to use distributed and fp16 training.")

                model = DDP(model)
            elif args.n_gpu > 1:
                model = torch.nn.DataParallel(model)

            for file, flag in [('dev.csv', 'dev'), ('test.csv', 'test')]:
            # for file, flag in [ ('test.csv', 'test')]:
                inference_labels = []
                gold_labels = []
                eval_examples = self.read_examples(os.path.join(args.data_dir, file), is_training=False)
                print('num eval examples:', len(eval_examples))
                eval_features = self.convert_examples_to_features(eval_examples, tokenizer, args.max_seq_length)
                all_input_ids = torch.tensor(self.select_field(eval_features, 'input_ids'), dtype=torch.long)
                all_input_mask = torch.tensor(self.select_field(eval_features, 'input_mask'), dtype=torch.long)
                all_segment_ids = torch.tensor(self.select_field(eval_features, 'segment_ids'), dtype=torch.long)
                all_label = torch.tensor([f.label for f in eval_features], dtype=torch.long)

                eval_data = TensorDataset(all_input_ids, all_input_mask, all_segment_ids, all_label)
                # Run prediction for full data
                eval_sampler = SequentialSampler(eval_data)
                eval_dataloader = DataLoader(eval_data, sampler=eval_sampler, batch_size=eval_batch_size)

                model.eval()
                eval_loss, eval_accuracy = 0, 0
                nb_eval_steps, nb_eval_examples = 0, 0
                for input_ids, input_mask, segment_ids, label_ids in eval_dataloader:
                    input_ids = input_ids.to(self.device)
                    input_mask = input_mask.to(self.device)
                    segment_ids = segment_ids.to(self.device)
                    label_ids = label_ids.to(self.device)

                    with torch.no_grad():
                        logits = model(input_ids=input_ids, token_type_ids=segment_ids,
                                       attention_mask=input_mask).detach().cpu().numpy()
                    label_ids = label_ids.to('cpu').numpy()
                    inference_labels.append(logits)
                    gold_labels.append(label_ids)
                gold_labels = np.concatenate(gold_labels, 0)
                logits = np.concatenate(inference_labels, 0)
                if flag == 'dev':
                    print(flag,self.accuracy(logits, gold_labels))
                if flag == 'test':
                    df = pd.read_csv(os.path.join(args.data_dir, file), names=['id', 'content', 'title', 'label'])
                    predict = np.argmax(logits, axis=1).tolist()
                    print(df.shape[0])
                    print(len(predict))
                    df['labelpre'] = predict
                    df[['id', 'labelpre']].to_csv(os.path.join(args.output_dir, "sub.csv"), index=False, header=False)
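An optional post-hoc sanity check on the submission written above; the "output" directory stands in for whatever args.output_dir was, so treat the path as an assumption.

import os
import pandas as pd

# reload the predictions and inspect the label distribution
sub = pd.read_csv(os.path.join("output", "sub.csv"), names=["id", "labelpre"])
print(sub.shape[0], "predictions")
print(sub["labelpre"].value_counts())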
Example #22
0
    def convert_examples_to_features(self,examples, tokenizer, max_seq_length):
        features = []
        '''
        Process each example.
        '''
        for example_index, example in enumerate(examples):

            # eachturn_tokens = tokenizer.tokenize(example.text_eachturn)
            eachturn_histokens = example.text_history.split(' ')
            eachturn_utttokens = example.text_eachturn.split(' ')
            eachturn_domain = example.domain
            eachturn_slot = example.slot
            eachturn_value_start= example.label_value_start
            eachturn_value_end = example.label_value_end
            eachturn_domainslot = example.label_domainslot

            choices_features = []
            # self._truncate_seq_pair(
            #     eachturn_utttokens,
            #     eachturn_histokens,
            #     max_seq_length - 6)
            utt_length = (max_seq_length - 6) // 2
            hist_length = (max_seq_length - 6) // 2
            if len(eachturn_utttokens) > utt_length:
                eachturn_utttokens = eachturn_utttokens[-utt_length:]
            utt_tokens = ["[CLS]"] + eachturn_utttokens + ["[SEP]"]
            utt_inputids_ = tokenizer.convert_tokens_to_ids(utt_tokens)
            utt_padding_length = utt_length+2-len(utt_inputids_)
            utt_inputids = utt_inputids_ + [0]*utt_padding_length
            utt_segmentid = [1]*len(utt_inputids)

            domain_slot_tokens = [eachturn_domain] +[eachturn_slot]+ ["[SEP]"]
            domain_slot_inputids = tokenizer.convert_tokens_to_ids(domain_slot_tokens)

            if len(eachturn_histokens) > hist_length:
                # truncate from the front, keeping the most recent history tokens
                # (the original sliced with utt_length by mistake)
                eachturn_histokens = eachturn_histokens[-hist_length:]
            hist_tokens = eachturn_histokens + ["[SEP]"]
            hist_inputids = tokenizer.convert_tokens_to_ids(hist_tokens)
            hist_padding_length = hist_length+1-len(hist_inputids)
            hist_inputids += [0]*hist_padding_length
            hist_segmentid = [0]*len(hist_inputids)

            tokens = utt_tokens + domain_slot_tokens + hist_tokens
            input_ids = utt_inputids + domain_slot_inputids + hist_inputids
            segment_ids = utt_segmentid + [0]+[0]+[0]+hist_segmentid
            input_mask = [1] * len(input_ids)

            utt_mask = [1]*len(utt_inputids_)+[0]*(utt_padding_length + len(domain_slot_inputids)+len(hist_inputids))
            domains_mask = [0]*len(utt_inputids)+[1]+[0]+[0]+[0]*len(hist_inputids)
            slot_mask = [0]*len(utt_inputids)+[0]+[1]+[0]+[0]*len(hist_inputids)
            hist_mask = [0]*len(utt_inputids)+[0]+[0]+[0]+[1]*len(hist_inputids)
            label_value_start = [0]+eachturn_value_start+[0]
            label_value_end = [0]+eachturn_value_end+[0]

            padding_length_domainslot = max_seq_length - len(label_value_start)
            label_value_start += ([0] * padding_length_domainslot)
            label_value_end += ([0] * padding_length_domainslot)
            assert len(input_ids)==len(utt_mask)==len(domains_mask)==len(slot_mask)==len(hist_mask)
            assert len(label_value_start) == len(label_value_end) == len(input_ids)

            choices_features.append((
                tokens, input_ids, input_mask, segment_ids,
                utt_mask, domains_mask, slot_mask, hist_mask
            ))

            if example_index < 3:
                logger.info("*** Example ***")
                logger.info("idx: {}".format(example_index))
                logger.info("guid: {}".format(example.guid))
                logger.info("tokens: {}".format(' '.join(tokens).replace('\u2581', '_')))
                logger.info("label_value_start: {}".format(label_value_start))
                logger.info("label_value_end: {}".format(label_value_end))
                logger.info("eachturn_domainslot: {}".format(eachturn_domainslot))

            features.append(
                InputFeatures(
                    example_id=example.guid,
                    choices_features=choices_features,
                    label_value_start=label_value_start,
                    label_value_end = label_value_end,
                    label_domainslot = eachturn_domainslot
                )
            )
        return features
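To see why the assembled sequence comes out at exactly max_seq_length, here is the length arithmetic from the function above as a standalone check (it assumes an even max_seq_length; this script is not part of the original code):

# layout: [CLS] utt [SEP] | domain slot [SEP] | hist [SEP], each segment padded
max_seq_length = 128
utt_length = (max_seq_length - 6) // 2    # 61
hist_length = (max_seq_length - 6) // 2   # 61
utt_block = utt_length + 2                # [CLS] + utterance + [SEP]
domain_slot_block = 3                     # domain, slot, [SEP]
hist_block = hist_length + 1              # history + [SEP]
assert utt_block + domain_slot_block + hist_block == max_seq_length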
    def convert_examples_to_features(self, examples, tokenizer,
                                     max_seq_length):
        features = []
        '''
        Process each example.
        '''
        for example_index, example in enumerate(examples):

            context_tokens = tokenizer.tokenize(example.text_a)
            ending_tokens = tokenizer.tokenize(example.text_b)

            choices_features = []
            self.truncature(context_tokens, ending_tokens, max_seq_length)
            tokens_utt = ["[CLS]"] + context_tokens + ["[SEP]"]
            tokens_resp = ["[CLS]"] + ending_tokens + ["[SEP]"]
            segment_id_utt = [1] * (len(tokens_utt))
            input_id_utt = tokenizer.convert_tokens_to_ids(tokens_utt)
            input_mask_utt = [1] * (len(tokens_utt))
            segment_id_resp = [1] * (len(tokens_resp))
            input_id_resp = tokenizer.convert_tokens_to_ids(tokens_resp)
            input_mask_resp = [1] * (len(tokens_resp))

            padding_length_utt = max_seq_length - len(input_id_utt)
            padding_length_resp = max_seq_length - len(input_id_resp)

            input_id_utt += ([0] * padding_length_utt)
            input_mask_utt += ([0] * padding_length_utt)
            segment_id_utt += ([0] * padding_length_utt)
            input_id_resp += ([0] * padding_length_resp)
            input_mask_resp += ([0] * padding_length_resp)
            segment_id_resp += ([0] * padding_length_resp)
            assert len(input_id_utt) == len(input_mask_utt) == len(
                segment_id_utt) == max_seq_length
            assert len(input_id_resp) == len(input_mask_resp) == len(
                segment_id_resp) == max_seq_length
            choices_features.append(
                (tokens_utt, input_id_utt, input_mask_utt, segment_id_utt,
                 tokens_resp, input_id_resp, input_mask_resp, segment_id_resp))
            label = example.label
            if example_index < 3:
                logger.info("*** Example ***")
                logger.info("idx: {}".format(example_index))
                logger.info("guid: {}".format(example.guid))
                logger.info("tokensUtt: {}".format(
                    ' '.join(tokens_utt).replace('\u2581', '_')))
                logger.info("tokensRESP: {}".format(
                    ' '.join(tokens_resp).replace('\u2581', '_')))
                logger.info("label: {}".format(label))
            features.append(
                InputFeatures(example_id=example.guid,
                              choices_features=choices_features,
                              label=label))
        return features
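select_field is called throughout these snippets but never defined. A common implementation in this family of fine-tuning scripts looks like the sketch below; it assumes InputFeatures exposes choices_features as a list of dicts keyed by field name, which is an assumption about code not shown here.

def select_field(features, field):
    # pull one named field (e.g. 'input_ids') out of every choice of every feature
    return [
        [choice[field] for choice in feature.choices_features]
        for feature in features
    ]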
Example #24
0
def init_test_result_folders():
    logger.info("Initialize TestResult dir")
    if not os.path.exists(TEST_RESULT_DIR):
        os.mkdir(TEST_RESULT_DIR)
Example #25
0
    def train(self):
        if not os.path.exists(self.output_dir):
            os.makedirs(self.output_dir)

        # logger.info(f'Fold {split_index + 1}')
        train_dataloader, eval_dataloader, train_examples, eval_examples = self.create_dataloader()

        num_train_optimization_steps = self.train_steps

        # Prepare model
        config = BertConfig.from_pretrained(self.model_name_or_path)
        model = BertForTokenClassification.from_pretrained(
            self.model_name_or_path, self.args, config=config)
        model.to(self.device)
        model.train()
        # Prepare optimizer
        param_optimizer = list(model.named_parameters())
        param_optimizer = [n for n in param_optimizer]

        no_decay = ['bias', 'LayerNorm.bias', 'LayerNorm.weight']
        optimizer_grouped_parameters = [{
            'params': [
                p for n, p in param_optimizer
                if not any(nd in n for nd in no_decay)
            ],
            'weight_decay':
            self.weight_decay
        }, {
            'params':
            [p for n, p in param_optimizer if any(nd in n for nd in no_decay)],
            'weight_decay':
            0.0
        }]

        optimizer = AdamW(optimizer_grouped_parameters,
                          lr=self.learning_rate,
                          eps=self.adam_epsilon)
        scheduler = WarmupLinearSchedule(optimizer,
                                         warmup_steps=self.warmup_steps,
                                         t_total=self.train_steps)

        global_step = 0

        logger.info("***** Running training *****")
        logger.info("  Num examples = %d", len(train_examples))
        logger.info("  Batch size = %d", self.train_batch_size)
        logger.info("  Num steps = %d", num_train_optimization_steps)

        best_acc = 0
        best_MRR = 0
        tr_loss = 0
        nb_tr_examples, nb_tr_steps = 0, 0
        train_dataloader = cycle(train_dataloader)

        for step in range(num_train_optimization_steps):
            batch = next(train_dataloader)
            batch = tuple(t.to(self.device) for t in batch)
            input_ids, input_mask, segment_ids, label_domain, label_dependcy = batch

            loss_domain, loss_dependcy = model(input_ids=input_ids,
                                               token_type_ids=segment_ids,
                                               attention_mask=input_mask,
                                               label_domain=label_domain,
                                               label_dependcy=label_dependcy)
            loss = loss_domain + loss_dependcy
            tr_loss += loss.item()
            train_loss = round(tr_loss / (nb_tr_steps + 1), 4)

            nb_tr_examples += input_ids.size(0)
            nb_tr_steps += 1

            loss.backward()
            if (nb_tr_steps + 1) % self.gradient_accumulation_steps == 0:

                optimizer.step()
                optimizer.zero_grad()
                scheduler.step()
                global_step += 1

            if (step + 1) % (self.eval_steps *
                             self.gradient_accumulation_steps) == 0:
                tr_loss = 0
                nb_tr_examples, nb_tr_steps = 0, 0
                logger.info("***** Report result *****")
                logger.info("  %s = %s", 'global_step', str(global_step))
                logger.info("  %s = %s", 'train loss', str(train_loss))

            if self.do_eval and (step + 1) % (
                    self.eval_steps * self.gradient_accumulation_steps) == 0:
                for file in ['dev.csv']:
                    inference_labels = []
                    gold_labels_domain = []
                    gold_labels_dependcy = []
                    inference_logits = []
                    scores_domain = []
                    scores_dependcy = []
                    ID = [x.guid for x in eval_examples]

                    logger.info("***** Running evaluation *****")
                    logger.info("  Num examples = %d", len(eval_examples))
                    logger.info("  Batch size = %d", self.eval_batch_size)

                    model.eval()
                    eval_loss_domain, eval_loss_dependcy, eval_accuracy_domain, eval_accuracy_dependcy = 0, 0, 0, 0
                    nb_eval_steps, nb_eval_examples = 0, 0
                    for input_ids, input_mask, segment_ids, label_domain, label_dependcy in eval_dataloader:
                        input_ids = input_ids.to(self.device)
                        input_mask = input_mask.to(self.device)
                        segment_ids = segment_ids.to(self.device)
                        label_domain = label_domain.to(self.device)
                        label_dependcy = label_dependcy.to(self.device)

                        with torch.no_grad():
                            batch_eval_loss_domain, batch_eval_loss_dependcy = model(
                                input_ids=input_ids,
                                token_type_ids=segment_ids,
                                attention_mask=input_mask,
                                label_domain=label_domain,
                                label_dependcy=label_dependcy)
                            logits_domain, logits_dependcy = model(
                                input_ids=input_ids,
                                token_type_ids=segment_ids,
                                attention_mask=input_mask)

                        logits_domain = logits_domain.view(
                            -1, self.num_labels_domain).detach().cpu().numpy()
                        logits_dependcy = logits_dependcy.view(
                            -1,
                            self.num_labels_dependcy).detach().cpu().numpy()

                        label_domain = label_domain.view(-1).to('cpu').numpy()
                        label_dependcy = label_dependcy.view(-1).to('cpu').numpy()

                        scores_domain.append(logits_domain)
                        scores_dependcy.append(logits_dependcy)

                        gold_labels_domain.append(label_domain)
                        gold_labels_dependcy.append(label_dependcy)

                        eval_loss_domain += batch_eval_loss_domain.mean().item()
                        eval_loss_dependcy += batch_eval_loss_dependcy.mean().item()
                        nb_eval_examples += input_ids.size(0)
                        nb_eval_steps += 1

                    gold_labels_domain = np.concatenate(gold_labels_domain, 0)
                    gold_labels_dependcy = np.concatenate(
                        gold_labels_dependcy, 0)
                    scores_domain = np.concatenate(scores_domain, 0)
                    scores_dependcy = np.concatenate(scores_dependcy, 0)
                    model.train()
                    eval_loss_domain = eval_loss_domain / nb_eval_steps
                    eval_loss_dependcy = eval_loss_dependcy / nb_eval_steps

                    eval_accuracy_domain = accuracyF1(scores_domain,
                                                      gold_labels_domain,
                                                      mode='domain')
                    eval_accuracy_dependcy = accuracyF1(scores_dependcy,
                                                        gold_labels_dependcy,
                                                        mode='dependcy')
                    print('eval_F1_domain', eval_accuracy_domain,
                          'eval_F1_dependcy', eval_accuracy_dependcy,
                          'global_step', global_step, 'loss', train_loss)
                    result = {
                        'eval_loss_domain': eval_loss_domain,
                        'eval_loss_dependcy': eval_loss_dependcy,
                        'eval_F1_domain': eval_accuracy_domain,
                        'eval_F1_dependcy': eval_accuracy_dependcy,
                        'global_step': global_step,
                        'loss': train_loss
                    }

                    output_eval_file = os.path.join(self.output_dir,
                                                    "eval_results.txt")
                    with open(output_eval_file, "a") as writer:
                        for key in sorted(result.keys()):
                            logger.info("  %s = %s", key, str(result[key]))
                            writer.write("%s = %s\n" % (key, str(result[key])))
                        writer.write('*' * 80)
                        writer.write('\n')
                    if eval_accuracy_domain > best_acc:
                        print("=" * 80)
                        print("Best F1", eval_accuracy_domain)
                        print("Saving Model......")
                        # best_acc = eval_accuracy
                        best_acc = eval_accuracy_domain
                        # Save a trained model
                        model_to_save = model.module if hasattr(
                            model, 'module') else model
                        output_model_file = os.path.join(
                            self.output_dir, "pytorch_model.bin")
                        torch.save(model_to_save.state_dict(),
                                   output_model_file)
                        print("=" * 80)
                    else:
                        print("=" * 80)
Example #26
0
#!/usr/bin/env python3
# -*- coding: utf-8 -*-

from Utils.Paths import CHROME_DRIVER_PATH
from Utils.Logger import logger
from selenium import webdriver
from selenium.common.exceptions import NoSuchElementException
import time

logger.info('Test started')

# Instantiate a browser object and open the browser
logger.info('Initializing the browser')
driver = webdriver.Chrome(CHROME_DRIVER_PATH)
driver.implicitly_wait(5)

try:
    # logger.info('Maximize the browser')
    # browser.maximize_window()
    logger.info('Navigating to https://www.utest.com/articles')
    driver.get('https://www.utest.com/articles')
    time.sleep(2)

    # One catch here: the page decides whether the left navigation menu is expanded
    # based on the current browser size. To handle both the expanded and collapsed
    # cases, the code has to check for itself: if the navigation menu is closed, the
    # body's class attribute is "loading-indicator-enabled"; otherwise it is
    # "loading-indicator-enabled nav-menu-open". A try...except statement is a
    # workable way to tell the two cases apart.

    logger.info('Verifying whether the navigation menu is expanded')
    try:
        driver.find_element_by_css_selector('.nav-menu-open')
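        # --- assumed continuation: the original snippet is truncated above ---
        # finding ".nav-menu-open" means the body carries the expanded-menu class
        logger.info('Navigation menu is expanded')
    except NoSuchElementException:
        logger.info('Navigation menu is collapsed')
finally:
    # hypothetical cleanup so this hedged completion stays self-contained
    logger.info('Quitting the browser')
    driver.quit()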
Example #27
0
    def convert_examples_to_features(self, examples, tokenizer,
                                     max_seq_length):
        features = []
        '''
        Process each example.
        '''
        for example_index, example in enumerate(examples):

            # eachturn_tokens = tokenizer.tokenize(example.text_eachturn)
            eachturn_tokens = example.text_history.split(' ')
            eachturn_labels_domainslot = example.label_domainslot
            eachturn_labels_domain = example.label_domain
            eachturn_labels_dependcy = example.label_dependcy

            choices_features = []
            total_length = len(eachturn_tokens)
            if total_length > max_seq_length - 2:
                # keep the first (max_seq_length - 2) positions so [CLS]/[SEP]
                # still fit; the original slice [:-max_seq_length] cut the wrong end
                eachturn_tokens = eachturn_tokens[:max_seq_length - 2]
                eachturn_labels_domain = eachturn_labels_domain[:max_seq_length - 2]
                eachturn_labels_dependcy = eachturn_labels_dependcy[:max_seq_length - 2]

            tokens = ["[CLS]"] + eachturn_tokens + ["[SEP]"]
            # print(type(eachturn_labels_domain))
            # print(eachturn_labels_domain)
            labels_domain = [0] + eachturn_labels_domain + [0]
            labels_dependcy = [0] + eachturn_labels_dependcy + [0]

            input_ids = tokenizer.convert_tokens_to_ids(tokens)
            segment_ids = [0] * len(input_ids)
            input_mask = [1] * len(input_ids)

            padding_length = max_seq_length - len(input_ids)
            input_ids += ([0] * padding_length)
            input_mask += ([0] * padding_length)
            segment_ids += ([0] * padding_length)
            labels_domain += ([0] * padding_length)
            labels_dependcy += ([0] * padding_length)

            assert len(input_ids) == len(input_mask) == len(segment_ids)
            assert len(input_ids) == len(labels_domain) == len(labels_dependcy)

            choices_features.append(
                (tokens, input_ids, input_mask, segment_ids))
            if example_index < 3:
                logger.info("*** Example ***")
                logger.info("idx: {}".format(example_index))
                logger.info("guid: {}".format(example.guid))
                logger.info("tokens: {}".format(' '.join(tokens).replace(
                    '\u2581', '_')))
                logger.info('turn_belief: {}'.format(example.turn_belief))
                logger.info("labels_domainslot: {}".format(
                    example.label_domainslot))
                logger.info("labels_domain: {}".format(labels_domain))
                logger.info("labels_dependcy: {}".format(labels_dependcy))
            features.append(
                InputFeatures(example_id=example.guid,
                              choices_features=choices_features,
                              labels_domainslot=example.label_domainslot,
                              labels_dependcy=labels_dependcy,
                              labels_domain=labels_domain))
        return features
    def tearDown(self):
        logger.info('*' * 100 + '\n')
Example #29
0
    def train(self):
        if not os.path.exists(self.output_dir):
            os.makedirs(self.output_dir)

        # logger.info(f'Fold {split_index + 1}')
        train_dataloader, eval_dataloader, train_examples, eval_examples = self.create_dataloader()

        num_train_optimization_steps = self.train_steps

        # Prepare model
        config = BertConfig.from_pretrained(self.model_name_or_path,
                                            num_labels=self.num_labels)
        model = BertForSequenceClassification.from_pretrained(
            self.model_name_or_path, self.args, config=config)
        model.to(self.device)
        model.train()
        # Prepare optimizer
        param_optimizer = list(model.named_parameters())
        param_optimizer = [n for n in param_optimizer]

        no_decay = ['bias', 'LayerNorm.bias', 'LayerNorm.weight']
        optimizer_grouped_parameters = [{
            'params': [
                p for n, p in param_optimizer
                if not any(nd in n for nd in no_decay)
            ],
            'weight_decay':
            self.weight_decay
        }, {
            'params':
            [p for n, p in param_optimizer if any(nd in n for nd in no_decay)],
            'weight_decay':
            0.0
        }]

        optimizer = AdamW(optimizer_grouped_parameters,
                          lr=self.learning_rate,
                          eps=self.adam_epsilon)
        scheduler = WarmupLinearSchedule(optimizer,
                                         warmup_steps=self.warmup_steps,
                                         t_total=self.train_steps)

        global_step = 0

        logger.info("***** Running training *****")
        logger.info("  Num examples = %d", len(train_examples))
        logger.info("  Batch size = %d", self.train_batch_size)
        logger.info("  Num steps = %d", num_train_optimization_steps)

        best_acc = 0
        best_MRR = 0
        tr_loss = 0
        nb_tr_examples, nb_tr_steps = 0, 0
        train_dataloader = cycle(train_dataloader)

        for step in range(num_train_optimization_steps):
            batch = next(train_dataloader)
            batch = tuple(t.to(self.device) for t in batch)
            input_ids, input_mask, segment_ids, utterance_mask, response_mask, history_mask, label_ids = batch
            loss = model(input_ids=input_ids,
                         token_type_ids=segment_ids,
                         attention_mask=input_mask,
                         utterance_mask=utterance_mask,
                         response_mask=response_mask,
                         history_mask=history_mask,
                         labels=label_ids)
            tr_loss += loss.item()
            train_loss = round(tr_loss / (nb_tr_steps + 1), 4)

            nb_tr_examples += input_ids.size(0)
            nb_tr_steps += 1

            loss.backward()
            if (nb_tr_steps + 1) % self.gradient_accumulation_steps == 0:

                optimizer.step()
                optimizer.zero_grad()
                scheduler.step()
                global_step += 1

            if (step + 1) % (self.eval_steps *
                             self.gradient_accumulation_steps) == 0:
                tr_loss = 0
                nb_tr_examples, nb_tr_steps = 0, 0
                logger.info("***** Report result *****")
                logger.info("  %s = %s", 'global_step', str(global_step))
                logger.info("  %s = %s", 'train loss', str(train_loss))

            if self.do_eval and (step + 1) % (
                    self.eval_steps * self.gradient_accumulation_steps) == 0:
                for file in ['dev.csv']:
                    inference_labels = []
                    gold_labels = []
                    inference_logits = []
                    scores = []
                    ID = [x.guid for x in eval_examples]

                    logger.info("***** Running evaluation *****")
                    logger.info("  Num examples = %d", len(eval_examples))
                    logger.info("  Batch size = %d", self.eval_batch_size)

                    model.eval()
                    eval_loss, eval_accuracy = 0, 0
                    nb_eval_steps, nb_eval_examples = 0, 0
                    for input_ids, input_mask, segment_ids, utterance_mask, response_mask, history_mask, label_ids in eval_dataloader:
                        input_ids = input_ids.to(self.device)
                        input_mask = input_mask.to(self.device)
                        segment_ids = segment_ids.to(self.device)
                        utterance_mask = utterance_mask.to(self.device)
                        response_mask = response_mask.to(self.device)
                        history_mask = history_mask.to(self.device)
                        label_ids = label_ids.to(self.device)

                        with torch.no_grad():
                            tmp_eval_loss = model(
                                input_ids=input_ids,
                                token_type_ids=segment_ids,
                                attention_mask=input_mask,
                                utterance_mask=utterance_mask,
                                response_mask=response_mask,
                                history_mask=history_mask,
                                labels=label_ids)
                            logits = model(
                                input_ids=input_ids,
                                token_type_ids=segment_ids,
                                attention_mask=input_mask,
                                utterance_mask=utterance_mask,
                                response_mask=response_mask,
                                history_mask=history_mask,
                            )

                        logits = logits.detach().cpu().numpy()
                        label_ids = label_ids.to('cpu').numpy()
                        inference_labels.append(np.argmax(logits, axis=1))
                        scores.append(logits)
                        gold_labels.append(label_ids)
                        inference_logits.append(logits)
                        eval_loss += tmp_eval_loss.mean().item()
                        nb_eval_examples += input_ids.size(0)
                        nb_eval_steps += 1

                    gold_labels = np.concatenate(gold_labels, 0)
                    inference_logits = np.concatenate(inference_logits, 0)
                    scores = np.concatenate(scores, 0)
                    model.train()
                    eval_loss = eval_loss / nb_eval_steps
                    eval_accuracy = accuracyCQA(inference_logits, gold_labels)
                    eval_DOUBAN_MRR, eval_DOUBAN_mrr, eval_DOUBAN_MAP, eval_Precision1 = compute_DOUBAN(
                        ID, scores, gold_labels)
                    r_at_1 = r_at_k(ID, scores, gold_labels, 1)
                    r_at_2 = r_at_k(ID, scores, gold_labels, 2)
                    r_at_5 = r_at_k(ID, scores, gold_labels, 5)
                    # print('eval_mrr',eval_mrr)
                    print('eval_F1', eval_accuracy, 'eval_MRR',
                          eval_DOUBAN_MRR, 'eval_MAP', eval_DOUBAN_MAP,
                          'eval_Precision1', eval_Precision1, 'r10@1', r_at_1,
                          'r10@2', r_at_2, 'r10@5', r_at_5, 'global_step',
                          global_step, 'loss', train_loss)
                    result = {
                        'eval_loss': eval_loss,
                        'eval_F1': eval_accuracy,
                        'eval_MRR': eval_DOUBAN_MRR,
                        'eval_MAP': eval_DOUBAN_MAP,
                        'eval_Precision1': eval_Precision1,
                        'r10@1': r_at_1,
                        'r10@2': r_at_2,
                        'r10@5': r_at_5,
                        'global_step': global_step,
                        'loss': train_loss
                    }

                    output_eval_file = os.path.join(self.output_dir,
                                                    "eval_results.txt")
                    with open(output_eval_file, "a") as writer:
                        for key in sorted(result.keys()):
                            logger.info("  %s = %s", key, str(result[key]))
                            writer.write("%s = %s\n" % (key, str(result[key])))
                        writer.write('*' * 80)
                        writer.write('\n')
                    # if eval_accuracy > best_acc :
                    if eval_DOUBAN_MRR > best_MRR:
                        print("=" * 80)
                        print("Best MRR", eval_DOUBAN_MRR)
                        print("Saving Model......")
                        # best_acc = eval_accuracy
                        best_MRR = eval_DOUBAN_MRR
                        # Save a trained model
                        model_to_save = model.module if hasattr(
                            model, 'module') else model
                        output_model_file = os.path.join(
                            self.output_dir, "pytorch_model.bin")
                        torch.save(model_to_save.state_dict(),
                                   output_model_file)
                        print("=" * 80)
                    else:
                        print("=" * 80)
Example #30
0
def text_test_runner(test_suite):
    runner = unittest.TextTestRunner()
    logger.info("Start testing...\n")
    runner.run(test_suite)
    logger.info("Finished testing.")