# Created time: 2019/1/29 21:51 # File usage: 多任务联合意图解析模型,输入为query,输出为tag与label import os import collections import pandas as pd import tensorflow as tf import constant import modeling import optimization import tokenization logger = constant.get_logger("joint_model") flags = tf.flags FLAGS = flags.FLAGS # 输入文件格式为包含query,tag与label三列的制表符分隔csv文件 # 其中query为未分词的句子,tag对应query中的每个字符,以空格分隔 # 一行数据形式如:招行的总部在哪?\tB-E I-E O B-R I-R O O\tInterpret flags.DEFINE_string("query_file", "data/query/query.txt", "Input query file") flags.DEFINE_string("bert_config_file", "model/chinese_L-12_H-768_A-12/bert_config.json", "The config json file corresponding to the pre-trained BERT model. " "This specifies the model architecture.") flags.DEFINE_string("vocab_file", "model/chinese_L-12_H-768_A-12/vocab.txt",
# -*- coding: utf-8 -*-
# Author: Dandy Qi
# Created time: 2019/1/17 18:09
# File usage: relation discrimination model; the inputs are the text
# representation of a query and of a relation, the output is a matching score.
import tensorflow as tf
import pandas as pd
import numpy as np
import collections
import os
import constant

logger = constant.get_logger("relation_classification")

flags = tf.flags
FLAGS = flags.FLAGS

flags.DEFINE_string("relation_file", "data/query/query_relation.csv",
                    "The path of relation train data")
flags.DEFINE_string("output_dir", "model/relation_classification",
                    "The path for saving model")
# BUG FIX: the three help strings below were all the copy-pasted
# "The size of valid size"; each now describes the flag it belongs to.
flags.DEFINE_integer("valid_size", 400,
                     "The size of the validation set")
flags.DEFINE_integer("max_seq_length", 128,
                     "The maximum total input sequence length")
flags.DEFINE_integer("width", 768,
                     "The width of the text representation vectors")
flags.DEFINE_float("dropout_prob", 0.4, "The probability of dropout")
flags.DEFINE_integer("save_checkpoint_steps", 100,
                     "The num of steps to save checkpoint")
flags.DEFINE_integer("save_summary_steps", 10,
                     "The num of steps to save summary")
# -*- coding: utf-8 -*- # Author: Dandy Qi # Created time: 2019/1/6 15:24 # File usage: 实体分类模型,输入为词条的摘要、信息框属性与原始类别,输出为实体对应类别 import tensorflow as tf import pandas as pd import collections import os import data_utils import constant logger = constant.get_logger("entity_classification") flags = tf.flags FLAGS = flags.FLAGS flags.DEFINE_string("train_file", "data/baike/train_data", "Input train file") flags.DEFINE_string("dev_file", "data/baike/dev_data", "Input dev file") flags.DEFINE_string("test_file", "data/baike/test_data", "Input test file") flags.DEFINE_string("output_dir", "model/entity_classification", "Output directory for model checkpoints") flags.DEFINE_string("token_vocab", "data/baike/token_vocab", "Path to token vocab") flags.DEFINE_string("attribute_vocab", "data/baike/attribute_vocab", "Path to attribute vocab") flags.DEFINE_string("category_vocab", "data/baike/category_vocab", "Path to category vocab")
# coding=utf-8
# Author: Dandy Qi
# Created time: 2019/1/12 16:34
# File usage: build pre-training data for text representations; extends the
# standard BERT procedure with special handling of entity words.
import collections
import random
import tensorflow as tf
import tokenization
import constant

logger = constant.get_logger("create_pre_training_data")

flags = tf.flags
FLAGS = flags.FLAGS

flags.DEFINE_string("input_file", "data/bert/corpus",
                    "Input raw text file (or comma-separated list of files).")
flags.DEFINE_string(
    "output_file", "data/bert/pre_training_data",
    "Output TF example file (or comma-separated list of files).")
flags.DEFINE_string("vocab_file", "model/chinese_L-12_H-768_A-12/vocab.txt",
                    "The vocabulary file that the BERT model was trained on.")
# BUG FIX: help text was copy-pasted from "vocab_file"; this flag actually
# points at the entity vocabulary, not the BERT word-piece vocabulary.
flags.DEFINE_string("entity_vocab_file", "data/KG/entity_vocab",
                    "The entity vocabulary file used by the entity-specific "
                    "pre-training procedure.")
# -*- coding: utf-8 -*- # Author: Dandy Qi # Created time: 2019/1/13 20:08 # File usage: 实体信息补全,包含信息框属性抽取,依存句法分析与关联规则 import pandas as pd import collections from itertools import chain, combinations import constant import data_utils from nlp_parser import SentenceParser logger = constant.get_logger("entity_complement") def get_corpus_in_sentences(doc): sentences = [data_utils.clean_text(s) for s in doc.split("。")] return sentences def stats_required_relation(data: pd.DataFrame): relation_set = { "机构": collections.OrderedDict(), "概念": collections.OrderedDict(), "人物": collections.OrderedDict(), "图书": collections.OrderedDict() } entity_counter = {"机构": 0, "概念": 0, "人物": 0, "图书": 0}
# Created time: 2018/12/28 17:13 # File usage: 若干实验的数据处理过程 import collections import numpy as np import pandas as pd import unicodedata import re import constant import jsonlines MAX_TOKEN_LENGTH = 120 MAX_ATTRIBUTE_LENGTH = 8 logger = constant.get_logger('create_training_data') def is_control(char): if char == "\t" or char == "\n" or char == "\r": return False cat = unicodedata.category(char) if cat.startswith("C"): return True return False def is_whitespace(char): if char == " " or char == "\t" or char == "\n" or char == "\r": return True cat = unicodedata.category(char)
# Created time: 2019/1/14 21:49 # File usage: 文本表示预训练任务,判断是否为实体添加了正确的关系 import os import random import collections import pandas as pd import tensorflow as tf import constant import modeling import optimization import tokenization logger = constant.get_logger("knowledge_pre_training") flags = tf.flags FLAGS = flags.FLAGS flags.DEFINE_string("knowledge_file", "data/KG/knowledge.csv", "Input training data") flags.DEFINE_string( "bert_config_file", "model/chinese_L-12_H-768_A-12/bert_config.json", "The config json file corresponding to the pre-trained BERT model. " "This specifies the model architecture.") flags.DEFINE_string("vocab_file", "model/chinese_L-12_H-768_A-12/vocab.txt", "The vocabulary file that the BERT model was trained on.") flags.DEFINE_string( "output_dir", "model/knowledge_pretraining", "The output directory where the model checkpoints will be written.")