Example #1
    def _evaluate_for_train_valid(self):
        """Evaluate model on train and valid set and get acc and f1 score.

        Returns:
            train_acc, train_f1, valid_acc, valid_f1
        """
        train_predictions, train_length = evaluate(
            model=self.model,
            data_loader=self.data_loader['valid_train'],
            device=self.device)
        valid_predictions, valid_length = evaluate(
            model=self.model,
            data_loader=self.data_loader['valid_valid'],
            device=self.device)

        train_answers = handy_tool(
            self.data_loader['train_label'],
            train_length)  #get_labels_from_file(self.config.train_file_path)
        valid_answers = handy_tool(
            self.data_loader['valid_label'],
            valid_length)  #get_labels_from_file(self.config.valid_file_path)
        train_predictions = self.flatten(train_predictions)
        valid_predictions = self.flatten(valid_predictions)
        train_answers = self.flatten(train_answers)
        valid_answers = self.flatten(valid_answers)
        train_acc, train_f1 = calculate_accuracy_f1(train_answers,
                                                    train_predictions)
        valid_acc, valid_f1 = calculate_accuracy_f1(valid_answers,
                                                    valid_predictions)
        return train_acc, train_f1, valid_acc, valid_f1
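Neither handy_tool nor self.flatten is defined in this snippet. The sketch below is a hypothetical reconstruction inferred purely from the call sites above (trim each gold label sequence to the unpadded length reported by evaluate, then flatten for scoring); it is not the project's actual implementation.

import itertools

def handy_tool(labels, lengths):
    # Assumed behavior: cut each label sequence down to the effective
    # (unpadded) length so answers align with the model's predictions.
    return [label[:length] for label, length in zip(labels, lengths)]

def flatten(list_of_lists):
    # Collapse per-sentence label lists into one flat list for scoring.
    return list(itertools.chain.from_iterable(list_of_lists))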
Example #2
    def bert_classification(self, content):
        logger.info('1:{}'.format(content))
        row = {'content': content}
        df = pandas.DataFrame([row])  # DataFrame().append was removed in pandas 2.0
        filename = "data/{}.csv".format(time.time())
        df.to_csv(filename, index=False, columns=['content'])
        test_set, sc_list, label_list = self.data.load_file(filename,
                                                            train=False)

        token_list = []
        for line in sc_list:
            tokens = self.data.tokenizer.convert_ids_to_tokens(line)
            token_list.append(tokens)

        data_loader_test = DataLoader(test_set,
                                      batch_size=self.config.batch_size,
                                      shuffle=False)
        # Evaluate
        answer_list, length_list = evaluate(self.model,
                                            data_loader_test,
                                            self.device,
                                            isTest=True)
        mod_tokens_list = handy_tool(token_list, length_list)
        result = [
            result_to_json(t, s) for t, s in zip(mod_tokens_list, answer_list)
        ]
        entities = [item['entities'] for item in result]
        entities = self.flatten(entities)
        amount_entities = [
            entity['word'] for entity in entities if entity['type'] == 'bms'
        ]

        return {"answer": amount_entities}
Example #3
    def bert_classification(self, content):
        logger.info('1:{}'.format(content))
        filename = "data/{}.csv".format(time.time())
        lines = self.split(content)
        items = [{"text":line} for line in lines]
        with open(filename, 'w', encoding='utf-8') as f:
            json.dump(items, f, ensure_ascii=False, indent=4)
        test_set, sc_list, label_list = self.data.load_file(filename, train=False)

        data_loader_test = DataLoader(
            test_set, batch_size=self.config.batch_size, shuffle=False)
        # Evaluate
        answer_list, length_list = evaluate(self.model,
                                            data_loader_test,
                                            self.device,
                                            isTest=True)

        token_list = []
        for line in sc_list:
            tokens = self.data.tokenizer.convert_ids_to_tokens(line)
            token_list.append(tokens)

        mod_tokens_list = handy_tool(token_list, length_list)
        result = [result_to_json(t, s) for t, s in zip(mod_tokens_list, answer_list)]
        entity_list = []
        for item in result:
            entities = item['entities']
            words = [
                d['word'] + "-" + all_type_dic[d['type']]
                for d in entities if d['type'] != 's'
            ]
            entity_list.extend(words)

        return {"answer": entity_list}
Example #4
    def bert_classification(self, content):
        logger.info('1:{}'.format(content))
        lines = self.split(content)
        rows = []
        for line in lines:
            rows.append({'content': line})
        df = pandas.DataFrame(rows)
        filename = "data/{}.csv".format(time.time())
        df.to_csv(filename, index=False, columns=['content'])
        test_set, sc_list, label_list, row_list = self.data.load_file(
            filename, train=False)

        data_loader_test = DataLoader(test_set,
                                      batch_size=self.config.batch_size,
                                      shuffle=False)
        # Evaluate
        answer_list, length_list = evaluate(self.model,
                                            data_loader_test,
                                            self.device,
                                            isTest=True)
        mod_tokens_list = handy_tool(row_list, length_list)
        result = [
            result_to_json(t, s) for t, s in zip(mod_tokens_list, answer_list)
        ]
        entities = [item['entities'] for item in result]
        entities = self.flatten(entities)

        return {"data": entities}
Example #5
    def bert_classification(self, content):
        logger.info('1:{}'.format(content))
        lines = self.split(content)
        rows = []
        for i, line in enumerate(lines):
            rows.append({"id": i, 'text': line})

        filename = "log/{}.json".format(time.time())
        with open(filename, 'w', encoding='utf-8') as f:
            json.dump(rows, f, ensure_ascii=False, indent=2)
        test_set, sc_list, label_list = self.data.load_file(filename,
                                                            train=False)

        token_list = []
        for line in sc_list:
            tokens = self.data.tokenizer.convert_ids_to_tokens(line)
            token_list.append(tokens)

        data_loader_test = DataLoader(test_set,
                                      batch_size=self.config.batch_size,
                                      shuffle=False)

        # Evaluate
        answer_list, length_list = evaluate(self.model,
                                            data_loader_test,
                                            self.device,
                                            isTest=True)

        # When gold labels are available, handy_tool(label_list, length_list)
        # plus calculate_accuracy_f1 could score the predictions here; this
        # serving path skips that step.

        mod_tokens_list = handy_tool(token_list, length_list)
        result = [
            result_to_json(t, s) for t, s in zip(mod_tokens_list, answer_list)
        ]
        entity_result = [res["entities"] for res in result]
        return {"data": entity_result}
Example #6
def main(out_file='output/result.json', model_config='config/rnn_config.json'):
    """Test model for given test set on 1 GPU or CPU.

    Args:
        in_file: file to be tested
        out_file: output file
        model_config: config file
    """
    # 0. Load config
    with open(model_config) as fin:
        config = json.load(fin, object_hook=lambda d: SimpleNamespace(**d))
    if torch.cuda.is_available():
        device = torch.device('cuda')
    else:
        device = torch.device('cpu')

    # 1. Load data
    data = Data(vocab_file=os.path.join(config.model_path, 'vocab.txt'),
                max_seq_len=config.max_seq_len,
                model_type=config.model_type,
                config=config)
    test_set, sc_list, label_list = data.load_file(config.test_file_path,
                                                   train=False)

    token_list = []
    for line in sc_list:
        tokens = data.tokenizer.convert_ids_to_tokens(line)
        token_list.append(tokens)

    data_loader_test = DataLoader(test_set,
                                  batch_size=config.batch_size,
                                  shuffle=False)
    # 2. Load model
    model = MODEL_MAP[config.model_type](config)
    model = load_torch_model(model,
                             model_path=os.path.join(config.model_path,
                                                     'model.bin'))
    model.to(device)
    # 3. Evaluate
    answer_list, length_list = evaluate(model,
                                        data_loader_test,
                                        device,
                                        isTest=True)

    # When gold labels are available, handy_tool(label_list, length_list)
    # plus calculate_accuracy_f1 could score the predictions here; for the
    # test set we only need the example ids from the test file.
    with open(config.test_file_path, 'r', encoding='utf-8') as fin:
        test_json = json.load(fin)
    id_list = [item['id'] for item in test_json]

    mod_tokens_list = handy_tool(token_list, length_list)
    result = [
        result_to_json(t, s) for t, s in zip(mod_tokens_list, answer_list)
    ]

    # 4. Write answers to file
    with open(out_file, 'w', encoding='utf8') as fout:
        result_list = []
        for example_id, item in zip(id_list, result):
            entities = item['entities']
            words = [
                d['word'] + "-" + d['type'] for d in entities
                if d['type'] != 's'
            ]
            # dict.fromkeys keeps first-seen order while dropping duplicates
            unique_words = list(dict.fromkeys(words))
            result_list.append({'id': example_id, 'entities': unique_words})
        json.dump(result_list, fout, ensure_ascii=False, indent=4)
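A hypothetical entry point, not part of the original snippet: main() takes only keyword defaults, so a plain __main__ guard is enough to run it. Each record written to out_file then has the shape {'id': ..., 'entities': [...]}.

if __name__ == '__main__':
    main(out_file='output/result.json',
         model_config='config/rnn_config.json')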