Example #1
# Created time: 2019/1/29 21:51
# File usage: multi-task joint intent parsing model; input is a query, output is its tags and label


import os
import collections

import pandas as pd
import tensorflow as tf

import constant
import modeling
import optimization
import tokenization

logger = constant.get_logger("joint_model")

flags = tf.flags
FLAGS = flags.FLAGS

# The input file is a tab-separated CSV with three columns: query, tag and label.
# query is the raw, unsegmented sentence; tag holds one space-separated tag for
# each character of query. A sample row (see the parsing sketch below):
# 招行的总部在哪?\tB-E I-E O B-R I-R O O\tInterpret
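
# A minimal parsing sketch for the format above (hypothetical helper, not part
# of the original pipeline): split a tab-separated row into the query, its
# per-character tags and the sentence-level label.
def parse_query_line(line):
    query, tag, label = line.rstrip("\n").split("\t")
    tags = tag.split(" ")  # one tag per character of the query
    return query, tags, label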

flags.DEFINE_string("query_file", "data/query/query.txt", "Input query file")

flags.DEFINE_string("bert_config_file", "model/chinese_L-12_H-768_A-12/bert_config.json",
                    "The config json file corresponding to the pre-trained BERT model. "
                    "This specifies the model architecture.")

flags.DEFINE_string("vocab_file", "model/chinese_L-12_H-768_A-12/vocab.txt",
# -*- coding: utf-8 -*-

# Author: Dandy Qi
# Created time: 2019/1/17 18:09
# File usage: relation classification model; input is the text representations of a query and a relation, output is the corresponding score

import tensorflow as tf
import pandas as pd
import numpy as np
import collections
import os

import constant

logger = constant.get_logger("relation_classification")

flags = tf.flags
FLAGS = flags.FLAGS

flags.DEFINE_string("relation_file", "data/query/query_relation.csv",
                    "The path of relation train data")
flags.DEFINE_string("output_dir", "model/relation_classification",
                    "The path for saving model")
flags.DEFINE_integer("valid_size", 400, "The size of valid size")
flags.DEFINE_integer("max_seq_length", 128, "The size of valid size")
flags.DEFINE_integer("width", 768, "The size of valid size")
flags.DEFINE_float("dropout_prob", 0.4, "The probability of dropout")
flags.DEFINE_integer("save_checkpoint_steps", 100,
                     "The num of steps to save checkpoint")
flags.DEFINE_integer("save_summary_steps", 10,
                     "The num of steps to save summary")
Example #3
# -*- coding: utf-8 -*-

# Author: Dandy Qi
# Created time: 2019/1/6 15:24
# File usage: entity classification model; input is an entry's abstract, infobox attributes and original category, output is the entity's category

import tensorflow as tf
import pandas as pd
import collections
import os

import data_utils
import constant

logger = constant.get_logger("entity_classification")
flags = tf.flags

FLAGS = flags.FLAGS

flags.DEFINE_string("train_file", "data/baike/train_data", "Input train file")
flags.DEFINE_string("dev_file", "data/baike/dev_data", "Input dev file")
flags.DEFINE_string("test_file", "data/baike/test_data", "Input test file")
flags.DEFINE_string("output_dir", "model/entity_classification",
                    "Output directory for model checkpoints")
flags.DEFINE_string("token_vocab", "data/baike/token_vocab",
                    "Path to token vocab")
flags.DEFINE_string("attribute_vocab", "data/baike/attribute_vocab",
                    "Path to attribute vocab")
flags.DEFINE_string("category_vocab", "data/baike/category_vocab",
                    "Path to category vocab")
Example #4
# coding=utf-8

# Author: Dandy Qi
# Created time: 2019/1/12 16:34
# File usage: generate pre-training data for text representations, adding a special pipeline for entity tokens on top of BERT

import collections
import random
import tensorflow as tf
import tokenization

import constant

logger = constant.get_logger("create_pre_training_data")

flags = tf.flags

FLAGS = flags.FLAGS

flags.DEFINE_string("input_file", "data/bert/corpus",
                    "Input raw text file (or comma-separated list of files).")

flags.DEFINE_string(
    "output_file", "data/bert/pre_training_data",
    "Output TF example file (or comma-separated list of files).")

flags.DEFINE_string("vocab_file", "model/chinese_L-12_H-768_A-12/vocab.txt",
                    "The vocabulary file that the BERT model was trained on.")

flags.DEFINE_string("entity_vocab_file", "data/KG/entity_vocab",
                    "The vocabulary file that the BERT model was trained on.")
Example #5
# -*- coding: utf-8 -*-

# Author: Dandy Qi
# Created time: 2019/1/13 20:08
# File usage: entity information completion, including infobox attribute extraction, dependency parsing and association rules

import pandas as pd
import collections
from itertools import chain, combinations

import constant
import data_utils
from nlp_parser import SentenceParser

logger = constant.get_logger("entity_complement")


def get_corpus_in_sentences(doc):
    # Split a document on the Chinese full stop and clean each sentence.
    sentences = [data_utils.clean_text(s) for s in doc.split("。")]
    return sentences
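
# Usage sketch with hypothetical input: a two-sentence document yields two
# cleaned strings (the exact output depends on data_utils.clean_text).
# >>> get_corpus_in_sentences("招商银行总部位于深圳。成立于1987年。")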


def stats_required_relation(data: pd.DataFrame):
    # Per-type relation tallies keyed by entity type
    # (机构 = organization, 概念 = concept, 人物 = person, 图书 = book).
    relation_set = {
        "机构": collections.OrderedDict(),
        "概念": collections.OrderedDict(),
        "人物": collections.OrderedDict(),
        "图书": collections.OrderedDict()
    }

    entity_counter = {"机构": 0, "概念": 0, "人物": 0, "图书": 0}
Example #6
# Created time: 2018/12/28 17:13
# File usage: data processing for several experiments

import collections
import numpy as np
import pandas as pd
import unicodedata
import re

import constant
import jsonlines

MAX_TOKEN_LENGTH = 120
MAX_ATTRIBUTE_LENGTH = 8

logger = constant.get_logger('create_training_data')


def is_control(char):
    # Tab, newline and carriage return count as whitespace here; any other
    # Unicode "C*" category character is treated as a control character.
    if char == "\t" or char == "\n" or char == "\r":
        return False
    cat = unicodedata.category(char)
    if cat.startswith("C"):
        return True
    return False


def is_whitespace(char):
    if char == " " or char == "\t" or char == "\n" or char == "\r":
        return True
    cat = unicodedata.category(char)
    if cat == "Zs":
        return True
    return False
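
# A small usage sketch for the two predicates above (hypothetical helper,
# mirroring the cleaning pass in BERT-style tokenizers): drop control
# characters and normalise every other whitespace character to a plain space.
def clean_chars(text):
    out = []
    for char in text:
        if is_control(char):
            continue
        out.append(" " if is_whitespace(char) else char)
    return "".join(out)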
Example #7
# Created time: 2019/1/14 21:49
# File usage: text representation pre-training task; judge whether the correct relation was added to an entity

import os
import random
import collections

import pandas as pd
import tensorflow as tf

import constant
import modeling
import optimization
import tokenization

logger = constant.get_logger("knowledge_pre_training")

flags = tf.flags
FLAGS = flags.FLAGS

flags.DEFINE_string("knowledge_file", "data/KG/knowledge.csv",
                    "Input training data")
flags.DEFINE_string(
    "bert_config_file", "model/chinese_L-12_H-768_A-12/bert_config.json",
    "The config json file corresponding to the pre-trained BERT model. "
    "This specifies the model architecture.")
flags.DEFINE_string("vocab_file", "model/chinese_L-12_H-768_A-12/vocab.txt",
                    "The vocabulary file that the BERT model was trained on.")
flags.DEFINE_string(
    "output_dir", "model/knowledge_pretraining",
    "The output directory where the model checkpoints will be written.")