def process_from_json(file_path, nlp_model):
    """
    Load a record from a json file and extract its knowledge into ES.
    :param file_path: json file path
    :param nlp_model: NLP model handed to documentExtraction
    :return:
    """
    try:
        # mongo = dbConnector(MONGODB_SERVER, MONGODB_PORT, MONGODB_DB, MONGODB_COLLECTION)
        es = esConnector(url=ES_URL, index=ES_INDEX, doc_type=ES_DOC_TYPE)
        with open(file_path, 'rb') as f:
            record = json.loads(f.read())
        document_model = documentExtraction(record, nlp_model)
        if not es.check_info_exist(document_model.title):
            logger.info('begin extract doc %s...' % document_model.title)
            document_info = document_model.extract_knowledge_from_record()
            if len(document_info.keys()):
                es.insert_single_info(document_info)
            else:
                logger.warn('extract document info failed, skip es store')
        else:
            logger.info('doc %s exist in es, skip' % document_model.title)
    except Exception as e:
        logger.error('document extraction process from json file failed for %s' % str(e))
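
# A minimal driver sketch for batch extraction. `JSON_DIR` and
# `load_nlp_model()` are hypothetical names -- substitute whatever this
# project actually uses to locate crawler output and build the NLP model.
def process_all_json(json_dir, nlp_model):
    # feed every .json record in the directory to the extractor
    for name in os.listdir(json_dir):
        if name.endswith('.json'):
            process_from_json(os.path.join(json_dir, name), nlp_model)

# process_all_json(JSON_DIR, load_nlp_model())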
def rule_notice_attach(self, source_info):
    """
    Rule for extracting attachment relations.
    :param source_info:
    :return: bool, string, list
    """
    try:
        link_info = list()
        info = source_info.get('_source', {})
        source_id = source_info.get('_id', '')
        if len(info.get('attachment_file', [])):
            for attachment_file in info.get('attachment_file'):
                # strip the file extension to get the searchable file name
                search_name = attachment_file[:-1 * (len(attachment_file.split('.')[-1]) + 1)]
                _id_list, _title_list = self.es_db.search_id_from_title(search_name)
                for _id, _title in zip(_id_list, _title_list):
                    # only link exact title matches that are not the document itself
                    if _id != source_id and _title == search_name:
                        link_info.append({
                            'source': source_id,
                            'target': _id,
                            'sourceType': 'id',
                            'targetType': 'id'
                        })
            if len(link_info):
                return True, 'attach', link_info
            else:
                return False, '', []
        else:
            return False, '', []
    except Exception as e:
        logger.error('searching attach relation failed for %s' % str(e))
        return False, '', []
def rule_file_from(self, source_info):
    """
    Rule for extracting the attachment-to-parent relation.
    :param source_info:
    :return: bool, string, list
    """
    try:
        link_info = list()
        info = source_info.get('_source', {})
        source_id = source_info.get('_id', '')
        if len(info.get('parrent_file', '')):
            search_name = info.get('parrent_file')
            _id_list, _title_list = self.es_db.search_id_from_title(search_name)
            for _id, _title in zip(_id_list, _title_list):
                if _id != source_id and _title == search_name:
                    link_info.append({
                        'source': source_id,
                        'target': _id,
                        'sourceType': 'id',
                        'targetType': 'id'
                    })
            if len(link_info):
                return True, 'from', link_info
            else:
                return False, '', []
        else:
            return False, '', []
    except Exception as e:
        logger.error('searching attach relation from failed for %s' % str(e))
        return False, '', []
def rule_doc_entity(self, source_info):
    """
    Rule for linking a document to the entities it mentions.
    :param source_info:
    :return: bool, string, list
    """
    try:
        link_info = list()
        info = source_info.get('_source', {})
        source_id = source_info.get('_id', '')
        entity_name = info.get('entity_name', [])
        entity_org = info.get('entity_org', [])
        entity_loc = info.get('entity_loc', [])
        entity_list = entity_name + entity_org + entity_loc
        for seg in entity_list:
            link_info.append({
                'source': source_id,
                'target': seg,
                'sourceType': 'id',
                'targetType': 'seg'
            })
        if len(link_info):
            return True, 'include', link_info
        else:
            return False, '', []
    except Exception as e:
        logger.error('searching entity relation failed for %s' % str(e))
        return False, '', []
def run(self):
    """
    Crawler entry point.
    """
    logger.info('begin crawler..')
    try:
        self._run()
    except Exception as e:
        logger.error('start crawler failed for %s, stop crawler' % str(e))
        sys.exit(1)
def rule_doc_quote(self, source_info):
    """
    Rule for extracting document citation relations, covering both identify
    numbers and quoted file names.
    :param source_info:
    :return: bool, string, list
    """
    try:
        link_info = list()
        info = source_info.get('_source', {})
        source_id = source_info.get('_id', '')
        source_identify = info.get('identify', '')
        source_quote = info.get('quote_title', []) + info.get('quote_content', [])
        # de-duplicate quoted file names while keeping their order
        source_quote_file = list()
        for item in source_quote:
            if item not in source_quote_file:
                source_quote_file.append(item)
        # search by identify number first
        if len(source_identify):
            _id_list = self.es_db.search_id_list_from_identify(source_identify)
        else:
            _id_list = []
        for _id in _id_list:
            if _id != source_id:
                link_info.append({
                    'source': source_id,
                    'target': _id,
                    'sourceType': 'id',
                    'targetType': 'id'
                })
        # then search by quoted file names
        for quote_file in source_quote_file:
            _id_list = self.es_db.search_id_list_from_filename(quote_file)
            for _id in _id_list:
                if _id != source_id:
                    link_info.append({
                        'source': source_id,
                        'target': _id,
                        'sourceType': 'id',
                        'targetType': 'id'
                    })
        if len(link_info):
            return True, 'quote', link_info
        else:
            return False, '', []
    except Exception as e:
        logger.error('searching quote relation failed for %s' % str(e))
        return False, '', []
def _save_json(self, content, file_path):
    """
    Save content to a json file.
    :param content:
    :param file_path:
    :return:
    """
    try:
        with open(file_path, 'wb') as f:
            # encode explicitly so non-ASCII output works with the binary handle
            f.write(json.dumps(content, ensure_ascii=False, indent=4).encode('utf-8'))
    except Exception as e:
        logger.error('write json file failed for %s' % str(e))
def _extract_keyword_from_doc(self):
    """
    Extract keywords from the document.
    :return:
    """
    try:
        doc = self.title + self.content
        key_word_model = TextSummary4Seg(doc, 6, 0.85, 700, self.model)
        return key_word_model.top_n_seg(5)
    except Exception as e:
        logger.error('extract key word from doc failed for %s' % str(e))
        return []
def _load_json(self, file_path):
    """
    Load a json file.
    :param file_path:
    :return:
    """
    try:
        with open(file_path, 'rb') as f:
            return json.loads(f.read())
    except Exception as e:
        logger.error('load json file failed for %s' % str(e))
        return None
def _extract_abstract_from_doc(self, seperated=False):
    """
    Extract a document abstract; currently extractive summarisation only,
    not abstractive.
    :param seperated: whether to summarise the content section by section
                      (not used yet)
    :return:
    """
    try:
        doc = self.title + self.content
        key_sentence_model = TextSummary4Sentence(doc, 700, 0.85, self.model)
        return key_sentence_model.top_n_sentence(3)
    except Exception as e:
        logger.error('extract abstract from doc failed for %s' % str(e))
        return []
def _create_entity_node(self, result_info):
    """
    Create the entity nodes in the graph.
    :param result_info:
    :return:
    """
    try:
        # cache created segs so each entity node is only created once
        entity_cache_list = list()
        for doc_info in result_info:
            info = doc_info['_source']
            entity_groups = [
                ('name', info.get('entity_name', [])),
                ('org', info.get('entity_org', [])),
                ('loc', info.get('entity_loc', []))
            ]
            for entity_type, seg_list in entity_groups:
                for seg in seg_list:
                    if seg in entity_cache_list:
                        continue
                    entity_info = {
                        'entity_type': entity_type,
                        'seg': seg
                    }
                    self.neo4j_db.create_entity_node(entity_info)
                    logger.info('create %s entity node of %s' % (entity_type, seg))
                    entity_cache_list.append(seg)
    except Exception as e:
        logger.error('create entity node failed for %s' % str(e))
def save_attachement_file(self, attachment_file_link, attachment_file_name):
    """
    Save an attachment file to disk.
    :param attachment_file_link:
    :param attachment_file_name:
    :return:
    """
    try:
        response = self.get(attachment_file_link)
        with open(os.path.join(SAVING_PATH, attachment_file_name), 'wb') as f:
            logger.info('saving file %s' % attachment_file_name)
            f.write(response)
    except Exception as e:
        logger.error('saving attachment file failed for %s' % str(e))
def _extract_public_org_2(self):
    """
    For central Ministry of Finance notices, extract the second-level
    publishing department; it is the first path segment after http in
    the notice link.
    :return:
    """
    try:
        link = self.record.get('noticeLink', '')
        link_start = re.findall(self.link_pattern, link)[0]
        second_org = CENTER_DEPARTMENT.get(link_start, '')
        return second_org
    except Exception as e:
        logger.error('extract public organization level 2 failed for %s' % str(e))
        return ''
def _extract_filename_from_title(self):
    """
    Extract the list of file names mentioned in the title.
    :return:
    """
    try:
        filename_list = list()
        for string in re.findall(self.file_pattern, self.title):
            if string not in filename_list:
                filename_list.append(string)
        return filename_list
    except Exception as e:
        logger.error('find file name from title failed for %s' % str(e))
        return []
def _extract_filename_from_doc(self):
    """
    Extract the list of file names mentioned in the document content.
    :return:
    """
    try:
        filename_list = list()
        for string in re.findall(self.file_pattern, self.content):
            if string not in filename_list:
                filename_list.append(string)
        return filename_list
    except Exception as e:
        logger.error('find file name from doc failed for %s' % str(e))
        return []
def _check_info_exist(self, title):
    """
    Check whether info with this title is already in the database.
    :param title:
    :return:
    """
    try:
        result = self.mongo.collection.find({'noticeTitle': title})
        try:
            # indexing the cursor raises IndexError when there is no hit
            result[0]
            return True
        except IndexError:
            return False
    except Exception as e:
        logger.error('check title failed for %s' % str(e))
        return False
def build_graph_by_id(self, id):
    """
    Build the graph links for a single document.
    :param id:
    :return:
    """
    try:
        doc_result = self.es_db.search_doc_by_id(id)
        doc_result_info = doc_result['hits']['hits']
        self._create_doc_node(doc_result_info)
        self._create_entity_node(doc_result_info)
    except Exception as e:
        logger.error('build graph by id failed for %s' % str(e))
def initial(self):
    """
    Main routine for building the graph database; the input data is read
    from the documents stored in es.
    :return:
    """
    try:
        result = self.es_db.search_all(size=10000)
        result_info = result['hits']['hits']
        self._create_doc_node(result_info)
        self._create_entity_node(result_info)
        self._create_node_relationship(result_info, self.rule_list)
    except Exception as e:
        logger.error('build graph failed for %s' % str(e))
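
# A hedged usage sketch. The class name `graphBuilder` is hypothetical --
# substitute the real builder class, which is assumed to wire up its
# es/neo4j connectors in the constructor:
#
#   builder = graphBuilder()
#   builder.initial()                  # full rebuild from everything in es
#   builder.build_graph_by_id(doc_id)  # incremental update for one document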
def rule_doc_trans(self, source_info):
    """
    Rule for extracting forwarding ('转发') relations between documents.
    :param source_info:
    :return: bool, string, list
    """
    try:
        link_info = list()
        info = source_info.get('_source', {})
        source_id = source_info.get('_id', '')
        title = info.get('title', '')
        # only titles containing '转发' can carry a forwarding relation
        if '转发' not in title:
            return False, '', []
        # the forwarded document's title is this title minus the '转发' marker
        search_title = title.replace('转发', '')
        _id_list, _ = self.es_db.search_id_from_title(search_title)
        for _id in _id_list:
            if _id != source_id:
                link_info.append({
                    'source': source_id,
                    'target': _id,
                    'sourceType': 'id',
                    'targetType': 'id'
                })
        if len(link_info):
            return True, 'trans', link_info
        else:
            return False, '', []
    except Exception as e:
        logger.error('searching doc trans relationship failed for %s' % str(e))
        return False, '', []
def preprocess_for_data():
    """
    Pre-processing pipeline that converts raw text into model input.
    :return:
    """
    try:
        train_sentences = load_sentence_file(FLAGS.train_file, FLAGS.zeros)
        dev_sentences = load_sentence_file(FLAGS.dev_file, FLAGS.zeros)
        test_sentences = load_sentence_file(FLAGS.test_file, FLAGS.zeros)
        # change the tag schema of the sentences in place
        trans_tag_schema(train_sentences, FLAGS.tag_schema)
        trans_tag_schema(test_sentences, FLAGS.tag_schema)
        # load/write the mapping file
        if not os.path.isfile(FLAGS.map_file):
            logger.info('mapping file does not exist, create mapping file')
            if FLAGS.pre_emb:
                # mapping with pre-trained embeddings is not implemented yet
                pass
            else:
                char_count_dic, id_to_char, char_to_id = char_mapping(
                    train_sentences, FLAGS.lower)
                tag_count_dic, id_to_tag, tag_to_id = tag_mapping(train_sentences)
                with open(FLAGS.map_file, 'wb') as f:
                    # mind the pickle file format difference between py2 and py3
                    pickle.dump([char_to_id, id_to_char, tag_to_id, id_to_tag], f)
        logger.info('loading mapping file')
        with open(FLAGS.map_file, 'rb') as f:
            char_to_id, id_to_char, tag_to_id, id_to_tag = pickle.load(f)
        # prepare model data sets
        # format --- [[char_list, char_id_list, seg_id_list, tag_id_list], ...]
        # seg_id_list example: [X/XX/XXX/XXXX] -> [0 /1 3 /1 2 3 /1 2 2 3]
        train_data = prepare_model_data(train_sentences, char_to_id, tag_to_id, FLAGS.lower)
        dev_data = prepare_model_data(dev_sentences, char_to_id, tag_to_id, FLAGS.lower)
        test_data = prepare_model_data(test_sentences, char_to_id, tag_to_id, FLAGS.lower)
        train_manager = BatchManager(train_data, FLAGS.batch_size)
        dev_manager = BatchManager(dev_data, 100)
        test_manager = BatchManager(test_data, 100)
        return train_manager, dev_manager, test_manager
    except Exception as e:
        logger.error('pre-process for train string failed for %s' % str(e))
def train():
    """
    Training entry point.
    :return:
    """
    try:
        # limit GPU memory usage
        tf_config = tf.ConfigProto()
        tf_config.gpu_options.allow_growth = True
        train_manager, dev_manager, test_manager = preprocess_for_data()
        logger.info('loading mapping file')
        with open(FLAGS.map_file, 'rb') as f:
            char_to_id, id_to_char, tag_to_id, id_to_tag = pickle.load(f)
        make_path(FLAGS)
        if os.path.isfile(FLAGS.config_file):
            config = load_config(FLAGS.config_file)
        else:
            config = build_config(char_to_id, tag_to_id)
            save_config(config, FLAGS.config_file)
        # needed below for progress logging
        steps_per_epoch = train_manager.len_data
        with tf.Session(config=tf_config) as sess:
            model = initial_ner_model(sess, NER_MODEL, FLAGS.ckpt_path,
                                      load_word2vec, config, id_to_char)
            logger.info("start training NER model")
            loss = []
            # epoch iterate
            for i in range(FLAGS.max_epoch):
                for batch in train_manager.iter_batch(shuffle=True):
                    step, batch_loss = model.run_step(sess, True, batch)
                    loss.append(batch_loss)
                    if step % FLAGS.steps_check == 0:
                        iteration = step // steps_per_epoch + 1
                        logger.info("iteration:{} step:{}/{}, "
                                    "NER loss:{:>9.6f}".format(
                                        iteration, step % steps_per_epoch,
                                        steps_per_epoch, np.mean(loss)))
                        loss = []
                save_model(sess, model, FLAGS.ckpt_path)
                # per-epoch evaluation for early stopping, currently disabled:
                # best = evaluate(sess, model, "dev", dev_manager, id_to_tag, logger)
                # if best:
                #     save_model(sess, model, FLAGS.ckpt_path, logger)
                # evaluate(sess, model, "test", test_manager, id_to_tag, logger)
    except Exception as e:
        logger.error('training model process failed for %s' % str(e))
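
# A minimal launch sketch, assuming the usual tf.app.flags setup already
# exists in this module; the `main` wrapper itself is hypothetical.
def main(_):
    train()

if __name__ == '__main__':
    tf.app.run(main)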
def _extract_identify_from_doc(self):
    """
    Extract document identify numbers from the body text; the current
    format looks like 财会〔2018〕20号.
    :return:
    """
    try:
        doc = self.content
        identify_list = re.findall(self.identify_pattern, doc)
        if len(identify_list):
            return identify_list
        else:
            logger.warn('doc do not have file identify')
            return []
    except Exception as e:
        logger.error('extract file identify from doc failed for %s' % str(e))
        return []
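
# A hedged sketch of what `self.identify_pattern` could look like for the
# format quoted in the docstring; the actual pattern in this repo may differ:
#
#   identify_pattern = re.compile(u'[\u4e00-\u9fa5]{2,6}〔\\d{4}〕\\d+号')
#   re.findall(identify_pattern, u'见(财会〔2018〕20号)文件')
#   # -> [u'财会〔2018〕20号']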
def save_notice_info(self, notice_info):
    """
    Insert the notice info, or update it when the title already exists.
    :param notice_info:
    :return:
    """
    try:
        if not self._check_info_exist(notice_info['noticeTitle']):
            logger.info('insert notice info...')
            self.mongo.collection.insert_one(notice_info)
        else:
            logger.info('update notice info...')
            self.mongo.collection.find_one_and_update(
                {'noticeTitle': notice_info['noticeTitle']},
                {'$set': notice_info})
    except Exception as e:
        logger.error('mongoDB store notice info failed for %s' % str(e))
def _create_node_relationship(self, result_info, rule_list):
    """
    Create links between nodes according to the given rules.
    :param result_info:
    :param rule_list:
    :return:
    """
    try:
        for source_info in result_info:
            # run every rule against the document
            logger.info('extract file with id %s' % str(source_info.get('_id', '')))
            for rule in rule_list:
                is_match, relationship_type, relationship_info = rule(source_info)
                if is_match:
                    logger.info('matching rule %s' % rule.__name__)
                    self.neo4j_db.create_relation(relationship_type, relationship_info)
    except Exception as e:
        logger.error('extract relationship between nodes failed for %s' % str(e))
def _create_doc_node(self, result_info):
    """
    Create the document nodes in the graph.
    :param result_info: query result from es
    :return:
    """
    try:
        for doc_info in result_info:
            doc_analysis = self._doc_info_analysis(doc_info)
            if doc_analysis:
                if not self.neo4j_db.check_node_exist(doc_analysis):
                    self.neo4j_db.create_doc_node(doc_analysis)
                    logger.info('create node...')
                else:
                    logger.info('node already exists, skip')
            else:
                logger.warn('analysis doc info failed, skip...')
    except Exception as e:
        logger.error('create doc node failed for %s' % str(e))
def _analysis_table_data(self, table_info, zb_key, reg_key):
    """
    table_info format:
    {
        'datanodes': [{
            u'code': u'zb.A080101_reg.110000_sj.2017',
            u'data': {u'dotcount': 2, u'data': 5430.79,
                      u'strdata': u'5430.79', u'hasdata': True},
            u'wds': [{u'wdcode': u'zb', u'valuecode': u'A080101'},
                     {u'wdcode': u'reg', u'valuecode': u'110000'},
                     {u'wdcode': u'sj', u'valuecode': u'2017'}]
        },  # table data
        ],
        'wdnodes': [{zb node info}, {reg node info}, {sj node info}]
    }
    :param table_info:
    :param zb_key: name of the indicator
    :param reg_key: name of the location
    :return:
    """
    try:
        # collected records, also kept around for saving to json
        data_list = list()
        # index the zb (indicator) nodes by code
        zb_node_dict = dict()
        for zb_node in table_info['returndata']['wdnodes'][0]['nodes']:
            zb_node_dict[zb_node['code']] = {
                'name': zb_node['cname'],
                'des': zb_node.get('exp', '') + zb_node.get('memo', ''),
                'unit': zb_node.get('exp', '')
            }
        for _data_info in table_info['returndata']['datanodes']:
            data_info = {
                'id': _data_info['wds'][0]['valuecode'],
                'mainKey': zb_key,
                'location': reg_key,
                'key': zb_node_dict[_data_info['wds'][0]['valuecode']]['name'],
                'value': _data_info['data']['data'],
                'year': _data_info['wds'][2]['valuecode'],
                'unit': zb_node_dict[_data_info['wds'][0]['valuecode']]['unit']
            }
            self._save_data(data_info)
            data_list.append(data_info)
        return data_list
    except Exception as e:
        logger.error('analysis table data failed for %s' % str(e))
        return []
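
# For the sample datanode shown in the docstring above, the resulting
# record would be (zb_key / reg_key are whatever the caller passed in):
#
#   {
#       'id': 'A080101', 'mainKey': zb_key, 'location': reg_key,
#       'key': <cname of zb node A080101>, 'value': 5430.79,
#       'year': '2017', 'unit': <exp of zb node A080101>
#   }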
def extract_knowledge_from_record(self):
    """
    Main extraction routine.
    :return:
    """
    try:
        # extract entities from the record
        entity_list = self._extract_entity_from_record()
        knowledge_body = {
            # debug: the time field must not be empty when storing into es,
            # otherwise the insert fails
            # crawler data may lack the time field; it could be extracted from
            # content_attach later, for now a default time is filled in
            # cc 2018-09-23
            'publish_time': self.record.get('publishTime', '')
            if len(self.record.get('publishTime', '')) else '2018-08-01',
            'publish_location': self.record.get('location', ''),
            'publish_org': LOCATION_ORG_DICT.get(self.record.get('location', ''), ''),
            'publish_org_2': self._extract_public_org_2(),
            'title': self.title,
            'category': self.record.get('category', ''),
            'classify': '',
            'content': self.content,
            'identify': self.record.get('noticeIdentify', ''),
            'content_identify': self._extract_identify_from_doc(),
            'content_attach': self.record.get('noticeAttachment', ''),
            'quote_title': self._extract_filename_from_title(),
            'quote_content': self._extract_filename_from_doc(),
            'entity_loc': [item[0] for item in entity_list if item[1] == 'ns'],
            'entity_org': [item[0] for item in entity_list if item[1] == 'ni'],
            'entity_name': [item[0] for item in entity_list if item[1] == 'np'],
            'attachment_file': self.record.get('attachmentFileList', [])
            if not self.file_name else [],
            'parrent_file': self.record.get('noticeTitle', '')
            if self.file_name else '',
            'key_word': [item[0] for item in self._extract_keyword_from_doc()]
            if self.type not in ['xls', 'xlsx'] else [],
            'abstract': [item[0] for item in self._extract_abstract_from_doc()]
            if self.type not in ['xls', 'xlsx'] else [],
            'data_key': [],
            'data': {}
        }
        return knowledge_body
    except Exception as e:
        logger.error('extract knowledge from record failed for %s' % str(e))
        return {}
def _search_time_from_title(self, title):
    """
    Search the title for a date string wrapped in full-width parentheses.
    :param title:
    :return:
    """
    try:
        # capture text inside full-width parentheses, e.g. （2018-09-23）
        pattern = re.compile(u'（(.*)）')
        for time_str in re.findall(pattern, title):
            try:
                datetime.datetime.strptime(time_str, '%Y-%m-%d')
                return time_str
            except ValueError:
                continue
        logger.warn('do not find time str..')
        return ''
    except Exception as e:
        logger.error('searching time string failed for %s' % str(e))
        return ''
def _check_info_exist(self, id, year, reg):
    """
    Check whether the data info already exists.
    :param id: indicator id
    :param year: data year
    :param reg: location name
    :return: bool
    """
    try:
        result = self.mongo.collection.find({
            'id': id,
            'year': year,
            'location': reg
        })
        try:
            # indexing the cursor raises IndexError when there is no hit
            result[0]
            return True
        except IndexError:
            return False
    except Exception as e:
        logger.error('check data info failed for %s' % str(e))
        return False
def __get_content_title(self):
    """
    Fetch content and title in one place, since both are needed repeatedly.
    :return:
    """
    try:
        if not self.file_name:
            self.title = self.record.get('noticeTitle', '')
            self.content = self.record.get('noticeContent', '')
            self.type = 'notice'
        else:
            # strip spaces from the file name; they were also removed
            # during file format conversion
            self.file_name = self.__pre_deal_with_str(self.file_name)
            if len(self.file_name.split('.')) >= 2:
                self.title = self.file_name.split('.')[-2]
            else:
                self.title = self.file_name
            file_type = self.file_name.split('.')[-1]
            # spreadsheets are converted to csv, everything else to txt
            if file_type in ['xls', 'xlsx']:
                trans_file_type = 'csv'
            else:
                trans_file_type = 'txt'
            trans_file_name = self.file_name[:-1 * (len(file_type) + 1)] + '.' + trans_file_type
            if os.path.isfile(os.path.join(FILE_PATH, trans_file_name)):
                logger.info('reading file %s' % trans_file_name)
                with open(os.path.join(FILE_PATH, trans_file_name), 'r') as f:
                    self.content = f.read()
                self.type = file_type
            else:
                logger.warn('file %s has no converted file' % trans_file_name)
                self.content = ''
                self.type = ''
    except Exception as e:
        logger.error('get content and title string failed for %s' % str(e))
        self.title = ''
        self.content = ''
        self.type = ''