def return_data_from_mongodb(self, request):
    """Render 'test1.html' with every mongodb document of one file plus the repo's category names.

    Expects 'repo_id' in the session and 'file_id' in the POST body; each
    missing value short-circuits into an error response.
    """
    try:
        repo_id = request.session['repo_id']
    except Exception:
        return self.error('没有知识库id')
    try:
        file_id = request.POST['file_id']
    except Exception:
        return self.error('没有文件id')
    try:
        text_collection = Mongodb(db='knowledge', collection='text').get_collection()
    except Exception:
        return self.error("mongodb没有数据库或者表")
    # Materialize the cursor so the template receives a plain list.
    documents = list(text_collection.find({'file_id': file_id}))
    # All category names belonging to this repo, in query order.
    category_names = [
        model_to_dict(category)['category_name']
        for category in TCategory.objects.filter(repo_id=repo_id)
    ]
    template_context = {'category_name': category_names, 'context': documents}
    print(template_context)
    return render(request, 'test1.html', context=template_context)
def save_mongodb_data_to_neo4j(self, request):
    """Copy one mongodb document into neo4j as a node labelled with its category name.

    NOTE(review): entity_id and category_id are hard-coded debug values and the
    request.POST reads are commented out — the except branches below can never
    fire as written. Restore the POST reads before production use.
    """
    try:
        #entity_id = request.POST['entity_id']
        entity_id = ObjectId("5eb52fc9d03fe5b0f31b6f40")
    except Exception:
        return self.error("没有收到entity_id")
    try:
        #category_id = request.POST['category_id']
        category_id = 1
    except Exception:
        return self.error("没有收到category_id")
    try:
        news_col = Mongodb(db='knowledge', collection='text').get_collection()
    except Exception:
        return self.error("mongodb没有数据库或者表")
    # The category's name becomes the neo4j node label.
    category_val = TCategory.objects.get(id=category_id)
    category_val_dict = model_to_dict(category_val)
    category_name = category_val_dict['category_name']
    tmp_data = {'_id': entity_id}
    ret_entity = news_col.find(tmp_data)
    # _id lookup yields at most one document; each becomes a neo4j node.
    for val in ret_entity:
        print(category_name, val, 1)
        Neo4j().create_node_mjy_edition(category_name, val)
    ret_l = {}
    return render(request, 'test1.html', context=ret_l)
def get_data_source():
    """Return the set of douban member URLs already stored in mongodb."""
    member_collection = Mongodb(db='movies', collection='member').get_collection()
    return {record["douban_url"] for record in member_collection.find()}
def GetStatistics(spider_id, repo_id):
    """Summarize how many documents a spider has stored for a repo.

    :param spider_id: spider whose documents are counted
    :param repo_id: knowledge repo the documents belong to
    :return: formatted string with total / today / this-week / this-month counts
    """
    collection = Mongodb(db='knowledge', collection='text').get_collection()
    base_query = {"spider_id": spider_id, "repo_id": repo_id}

    def _count(extra=None):
        # Cursor.count() was deprecated in pymongo 3.7 and removed in pymongo 4;
        # count_documents() is the supported server-side equivalent.
        query = dict(base_query)
        if extra:
            query.update(extra)
        return collection.count_documents(query)

    # Use one timestamp for all three windows so they are mutually consistent.
    now = time.time()
    count = _count()
    curr_date = time.strftime('%Y-%m-%d', time.localtime(now))
    # "Today" matches by date prefix of the stored crawl_time string.
    count_today = _count({"value.crawl_time": {'$regex': curr_date}})
    # Week/month rely on lexicographic '$gt' over '%Y-%m-%d %H:%M:%S' strings,
    # which sorts the same as chronological order for this fixed format.
    week_start = time.strftime('%Y-%m-%d %H:%M:%S',
                               time.localtime(now - 7 * 24 * 3600))
    count_week = _count({"value.crawl_time": {'$gt': week_start}})
    month_start = time.strftime('%Y-%m-%d %H:%M:%S',
                                time.localtime(now - 30 * 24 * 3600))
    count_month = _count({"value.crawl_time": {'$gt': month_start}})
    return '数据:%6s条 今日:%6s条 本周:%6s条 本月:%s条' % (count, count_today,
                                                  count_week, count_month)
from model.mongodb import Mongodb
from bson import ObjectId

if __name__ == '__main__':
    # Ad-hoc inspection script: print the _id of every document in the test
    # collection that has file_id == 1 and a nested "value.test1" field.
    test_col = Mongodb(db='test', collection='test').get_collection()
    # for i in test_col.find():
    #     if "file_id" in i:
    #         print(i["file_id"])
    #     test_col.update_one({"_id": i["_id"]}, {"$set": {"alexa": "12"}})
    test = test_col.find({"file_id": 1, "value.test1": {"$exists": True}})
    for item in test:
        print(item["_id"])
def update_t_mapping_rule(self, repo_id, create_id):
    """Recompute the attribute -> mapping-rule table for every category of a repo.

    For each category: collect its (and its ancestors') known attribute aliases,
    count how often each key appears in the category's mongodb documents, delete
    mapping rules whose attribute no longer occurs, and create/update a
    TMappingRule (with its coverage rate) for every key that is not already an
    alias.
    """
    # Gather every category of this repo/creator; the computation below runs per category.
    return_category = TCategory.objects.filter(repo_id=repo_id, create_id=create_id)
    category_name_list = []
    category_id_list = []
    for val in return_category:
        val_dict = model_to_dict(val)
        category_name_list.append(val_dict['category_name'])
        category_id_list.append(val_dict['id'])
    list_len = len(category_name_list)
    for i in range(0, list_len):
        tmp_id = category_id_list[i]
        attribute_name_alias_map = {}
        return_attribute = TAttribute.objects.filter(category_id=tmp_id)
        # return_attribute_name_map folds a set of attributes (own and inherited)
        # into the alias map; it is reused for each ancestor level below.
        attribute_name_alias_map = self.return_attribute_name_map(
            return_attribute, attribute_name_alias_map)
        ret_cate = TCategory.objects.get(id=tmp_id)
        ret_cate_dict = model_to_dict(ret_cate)
        father_category_id = ret_cate_dict['father_category_id']
        # father_category_id is stored as a string; "-1" marks "no parent".
        if (str(-1) != father_category_id):
            # Merge the parent category's attributes into the alias map.
            return_attribute_father = TAttribute.objects.filter(
                category_id=father_category_id)
            attribute_name_alias_map = self.return_attribute_name_map(
                return_attribute_father, attribute_name_alias_map)
            ret_cate_father = TCategory.objects.get(id=father_category_id)
            ret_cate_dict_father = model_to_dict(ret_cate_father)
            father_father_category_id = ret_cate_dict_father[
                'father_category_id']
            if (str(-1) != father_father_category_id):
                # Merge the grandparent's attributes too (hierarchy is at most
                # walked two levels up here).
                return_attribute_father_father = TAttribute.objects.filter(
                    category_id=father_father_category_id)
                attribute_name_alias_map = self.return_attribute_name_map(
                    return_attribute_father_father, attribute_name_alias_map)
        news_col = Mongodb(db='knowledge', collection='text').get_collection()
        # All known attribute names/aliases are now in attribute_name_alias_map.
        _insert_mapping_rule_attribute_name_list = []
        _insert_mapping_rule_attribute_coverage_rate_list = []
        category_id = category_id_list[i]
        # Count, over this category's mongodb documents, how many documents
        # contain each key (bookkeeping keys excluded).
        attribute_name_map = {}
        tmp_list = news_col.find({'category_id': category_id})
        num = 0
        for val in tmp_list:
            num += 1
            if val is not None:
                for key in val.keys():
                    if (key == '_id' or key == 'file_id'
                            or key == 'category_id'):
                        continue
                    elif (key in attribute_name_map):
                        attribute_name_map[key] += 1
                    else:
                        attribute_name_map[key] = 1
        # Stale-rule cleanup: any existing mapping rule whose attribute name no
        # longer appears in the documents gets deleted.
        delete_id_list = []
        return_mapping_rule = TMappingRule.objects.filter(
            category_id=category_id, create_id=create_id)
        for rule in return_mapping_rule:
            rule_dict = model_to_dict(rule)
            print(rule_dict)
            rule_dict_id = rule_dict['id']
            rule_dict_attribute_name = rule_dict['attribute_name']
            if rule_dict_attribute_name not in attribute_name_map.keys():
                delete_id_list.append(rule_dict_id)
        for mapping_rule_id in delete_id_list:
            # NOTE(review): deletions are not logged anywhere; consider auditing.
            rule_mapping = TMappingRule.objects.get(id=mapping_rule_id)
            rule_mapping.delete()
        if attribute_name_map is not None:
            for key in attribute_name_map.keys():
                if (key in attribute_name_alias_map):
                    # Key is already a known attribute/alias — nothing to insert.
                    a = 1
                else:
                    _insert_mapping_rule_attribute_name_list.append(key)
                    # Coverage rate = fraction of documents containing this key.
                    coverage_rate = 1.0 * attribute_name_map[key] / num
                    _insert_mapping_rule_attribute_coverage_rate_list.append(
                        coverage_rate)
        attribute_name_list_len = len(
            _insert_mapping_rule_attribute_name_list)
        dt = datetime.now()
        for k in range(0, attribute_name_list_len):
            attribute_name_val = _insert_mapping_rule_attribute_name_list[
                k]
            attribute_coverage_val = _insert_mapping_rule_attribute_coverage_rate_list[
                k]
            obj = TMappingRule.objects.filter(
                attribute_name=attribute_name_val,
                create_id=create_id).first()
            if (obj is None):
                # No rule yet for this attribute name: create one.
                TMappingRule.objects.create(
                    attribute_name=attribute_name_val,
                    coverage_rate=attribute_coverage_val,
                    create_time=str(dt)[:19],
                    category_id=category_id,
                    create_id=create_id)
            else:
                # Rule exists: refresh its coverage rate and timestamp.
                obj.coverage_rate = attribute_coverage_val
                obj.create_time = str(dt)[:19]
                obj.save()
    return 1
class MaoyanSpider(Driver):
    """Selenium-based spider for Maoyan professional box-office dashboard pages."""

    def __init__(self,
                 isheadless=False,
                 ismobile=False,
                 isvirtualdisplay=False,
                 isloadimages=True,
                 isproxy=False,
                 spider_id='2'):
        # spider_id doubles as the log-file name for the underlying Driver.
        Driver.__init__(self,
                        log_file_name=spider_id,
                        ismobile=ismobile,
                        isvirtualdisplay=isvirtualdisplay,
                        isheadless=isheadless,
                        isloadimages=isloadimages,
                        isproxy=isproxy)
        # Scraped box-office rows land in the shared knowledge/text collection.
        self.boxoffice_col = Mongodb(db='knowledge',
                                     collection='text').get_collection()
        self.news_col = Mongodb(db='movies1',
                                collection='news').get_collection()

    @staticmethod
    def find_key_from_value(dict, value):
        # Reverse lookup: first key whose value equals `value`, else None.
        # NOTE(review): the parameter shadows the builtin `dict`.
        key_list = dict.keys()
        for key in key_list:
            if value == dict[key]:
                return key
        return None

    def get_boxoffice_infos_from_one_page(self,
                                          url="",
                                          datetime="",
                                          user_id=-1,
                                          repo_id=-1):
        """
        获取猫眼此时刻票房数据
        Scrape one dashboard page of box-office rows.

        :param repo_id: repo the rows belong to
        :param user_id: user the rows belong to
        :param datetime: date string ('%Y-%m-%d') shown on the page
        :param url: dashboard URL for that date
        :return: True (page has no dashboard content) / False (no table rows) /
                 (rows, True) page fully scraped / (rows, False) hit a row that
                 already exists in mongodb — caller should stop paging further back.
        """
        self.fast_new_page(url=url)
        time.sleep(1)
        # Page without the dashboard container: nothing to scrape for this date.
        if not self.judge_web_element_exist_by_css_selector(
                css_selector="div.dashboard-content"):
            self.close_curr_page()
            return True
        # Column headers; the first column (movie cell) is handled separately.
        theads = self.until_presence_of_all_elements_located_by_css_selector(
            css_selector=
            "div.dashboard-list > table.dashboard-table.table-header > thead > tr > th"
        )[1:]
        theads = [item.text for item in theads]
        if not self.judge_web_element_exist_by_css_selector(
                css_selector=
                "div.movielist-container > div.movielist > table.dashboard-table > tbody > tr"
        ):
            self.close_curr_page()
            return False
        boxoffice_infos = self.until_presence_of_all_elements_located_by_css_selector(
            css_selector=
            "div.movielist-container > div.movielist > table.dashboard-table > tbody > tr"
        )
        crwal_time = time.strftime('%Y-%m-%d %H:%M:%S',
                                   time.localtime(time.time()))
        boxoffice_data_from_the_page = []
        for item in boxoffice_infos:
            one_boxoffice_data = {}
            boxoffice_info = self.until_presence_of_all_elements_located_by_css_selector(
                css_selector="td", ele=item)
            # First cell holds movie name plus release date / total box office.
            movie_name = self.until_presence_of_element_located_by_css_selector(
                css_selector="div > div.moviename-desc > p.moviename-name",
                ele=boxoffice_info[0])
            movie_info = self.until_presence_of_all_elements_located_by_css_selector(
                css_selector=
                "div > div.moviename-desc > p.moviename-info > span",
                ele=boxoffice_info[0])
            one_boxoffice_data.setdefault("日期", datetime)
            one_boxoffice_data.setdefault("电影名", movie_name.text)
            one_boxoffice_data.setdefault("上映时间", movie_info[0].text)
            one_boxoffice_data.setdefault("总票房", movie_info[1].text)
            # Remaining cells map 1:1 onto the header columns.
            boxoffice_info = boxoffice_info[1:]
            for i in range(len(boxoffice_info)):
                one_boxoffice_data.setdefault(theads[i],
                                              boxoffice_info[i].text)
            one_boxoffice_data.setdefault("crawl_time", crwal_time)
            one_boxoffice_data.setdefault("crawl_from", "猫眼专业版")
            # self.piaofang_col.insert_one(one_piaofang_data)
            # Dedup check against mongodb: True means the row is new.
            judge_result = self.judge_data_exist_by_keys(
                collection=self.boxoffice_col,
                keys={
                    "user_id": user_id,
                    "repo_id": repo_id,
                    "value.日期": one_boxoffice_data["日期"],
                    "value.电影名": one_boxoffice_data["电影名"],
                    "value.crawl_from": one_boxoffice_data["crawl_from"]
                })
            if judge_result is True:
                boxoffice_data_from_the_page.append(one_boxoffice_data)
            else:
                # Already-stored row reached: return what we have and signal stop.
                return boxoffice_data_from_the_page, False
        self.close_curr_page()
        return boxoffice_data_from_the_page, True

    def get_boxoffice_infos(self, spider_id, user_id, repo_id, spider_name):
        """Walk the dashboard backwards one day at a time, then persist everything.

        Stops when a page signals that its rows already exist in mongodb; on
        success, writes acquisition/extraction log rows and inserts every
        scraped row into the box-office collection.
        """
        # Fixed start date (debug); the commented line would start from "now".
        date = datetime.datetime.strptime("2020-01-23", '%Y-%m-%d')
        # date = datetime.datetime.now()
        final_result = []
        while True:
            data_list, result = self.get_boxoffice_infos_from_one_page(
                url="http://piaofang.maoyan.com/dashboard/movie?date=" +
                str(date)[:10],
                datetime=str(date)[:10],
                user_id=int(user_id),
                repo_id=int(repo_id))
            final_result.extend(data_list)
            if result is False:
                break
            # Step one day into the past.
            date = date + datetime.timedelta(days=-1)
        if len(final_result) == 0:
            return
        one_data_acquisition_log = TDataAcquisitionLog.objects.create(
            create_time=timezone.now(),
            data_source_name=spider_name,
            data_access="爬虫",
            repo_id=int(repo_id),
            create_id=int(user_id),
            data_path="")
        TEntityExtractionLog.objects.create(
            data_acquisition_id=one_data_acquisition_log.id,
            is_extract=0,
            entity_number=0,
            extract_time=timezone.now(),
            create_id=int(user_id),
            repo_id=int(repo_id))
        for item in final_result:
            # file_id ties each mongodb row back to the acquisition log entry.
            self.boxoffice_col.insert_one({
                "file_id": one_data_acquisition_log.id,
                "category_id": -1,
                "spider_id": int(spider_id),
                "user_id": int(user_id),
                "repo_id": int(repo_id),
                "value": item
            })

    def run_spider(self, url=""):
        """Resume scraping from the day after the newest stored row up to today.

        NOTE(review): queries a top-level "datetime" field, while
        get_boxoffice_infos stores dates under "value.日期" — confirm the
        schema this method expects before relying on it.
        """
        lastest_info = self.boxoffice_col.find().sort("datetime",
                                                      -1).limit(1)
        date = datetime.datetime.strptime(lastest_info[0]["datetime"],
                                          '%Y-%m-%d')
        date = date + datetime.timedelta(days=1)
        now = datetime.datetime.now()
        while date < now:
            self.get_boxoffice_infos_from_one_page(
                "http://piaofang.maoyan.com/dashboard/movie?date=" +
                str(date)[:10],
                str(date)[:10])
            date = date + datetime.timedelta(days=1)
def eventExtraction(self, request, file_id, lEventCategoryId):
    """
    功能 进行模板匹配的事件抽取
    Template-matching event extraction over the documents of one file.

    For every trigger word of the repo, builds an "event label" template
    (event category, subject label, trigger label, optional object label),
    registers the relevant vocabulary with the segmenter, then scans each
    document sentence by sentence: matched events are written to neo4j
    (event node + subject/object relationships) and an
    'event_extract_result' list is stored back onto the mongodb document.

    :param request: request参数
    :param file_id: 数据类型str 文件id
    :param lEventCategoryId: 数据类型list 事件类目id — only rules whose
        event category is in this list are applied
    :return: True
    """
    # ruleId semantics: 1 = subject-verb-object triple, 2 = subject-verb only.
    repoId = request.session['repo_id']
    createId = request.session['user_id']
    # Only documents of this file / user / repo are processed.
    tmp_info = {'file_id': file_id, 'user_id': createId, 'repo_id': repoId}
    news_col = Mongodb(db='knowledge', collection='text').get_collection()
    cnt = 1
    ret_entity = news_col.find(tmp_info)
    # Keep only documents that actually carry text content ("内容").
    ret_entity_map = list()
    for item in ret_entity:
        if "内容" in item["value"]:
            ret_entity_map.append(item)
    if len(ret_entity_map) == 0:
        return
    print("--------------------事件抽取")
    # Phase 1: for every trigger word of the repo, assemble its label template
    # and feed subject/object vocabularies plus the trigger word itself into
    # the segmenter.
    retTriggerWordList = TTriggerWord.objects.filter(repo_id=repoId)
    eventLabelList = []
    for i in retTriggerWordList:
        tmpLableList = []
        ruleId = 1
        retTriggerWordDict = model_to_dict(i)
        triggerId = retTriggerWordDict['id']
        eventId = retTriggerWordDict['event_rule_id']
        # The trigger word is masked with the label of its event's category.
        retEventRule = TEventRule.objects.get(id=eventId)
        retCategoryName = TCategory.objects.get(
            id=retEventRule.category_id).category_name
        triggerWord = retTriggerWordDict['trigger_word']
        triggerWordId = BaseController.get_category_name(
            request, retCategoryName)
        eventRule = TEventRule.objects.get(id=eventId, repo_id=repoId)
        eventRuleDict = model_to_dict(eventRule)
        eventCategoryId = eventRuleDict['category_id']
        # Skip rules whose event category was not requested by the caller.
        if (eventCategoryId not in lEventCategoryId):
            continue
        eventCategory = TCategory.objects.get(id=eventCategoryId,
                                              repo_id=repoId,
                                              create_id=createId)
        eventCategoryDict = model_to_dict(eventCategory)
        eventCategoryName = eventCategoryDict['category_name']
        # Template slot 0: event category name.
        tmpLableList.append(eventCategoryName)
        subjectCategoryId = eventRuleDict['event_subject_id']
        subjectCategory = TCategory.objects.get(id=subjectCategoryId,
                                                repo_id=repoId,
                                                create_id=createId)
        subjectCategoryDict = model_to_dict(subjectCategory)
        subjectCategoryName = subjectCategoryDict['category_name']
        subjectId = BaseController.get_category_name(
            request, subjectCategoryName)
        # Template slot 1: subject label.
        tmpLableList.append(subjectId)
        # All entity names of the subject category become segmenter vocabulary
        # masked with the subject label.
        retListId, retListVal = some_data_deal_func(
        ).inputCategoryIdReturnName(subjectCategoryId, repoId, createId)
        constructWordList = []
        tmpSet = self.hanlp_tool.added_word_list
        for word in retListVal:
            if (word == None):
                continue
            tmpDict = {}
            tmpDict['word'] = word
            tmpDict['mask'] = subjectId
            constructWordList.append(tmpDict)
        # add_word_list expects [{'word': ..., 'mask': ...}, ...].
        self.hanlp_tool.add_word_list(constructWordList)
        objectCategoryId = eventRuleDict['event_object_id']
        negativeOne = -1
        # event_object_id == -1 marks a subject-verb rule (no object slot).
        if (objectCategoryId == negativeOne):
            ruleId = 2
        # Register the trigger word itself, masked with the trigger label.
        constructWordList = []
        tmpDict = {}
        tmpDict['word'] = triggerWord
        tmpDict['mask'] = str(triggerWordId)
        tmpSet = self.hanlp_tool.added_word_list
        constructWordList.append(tmpDict)
        self.hanlp_tool.add_word_list(constructWordList)
        # Template slot 2: trigger label.
        tmpLableList.append(str(triggerWordId))
        print(ruleId)
        if (ruleId == 1):
            # SVO rule: also register the object category's vocabulary and add
            # template slot 3: object label.
            objectCategoryId = eventRuleDict['event_object_id']
            objectCategory = TCategory.objects.get(id=objectCategoryId,
                                                   repo_id=repoId,
                                                   create_id=createId)
            objectCategoryDict = model_to_dict(objectCategory)
            objectCategoryName = objectCategoryDict['category_name']
            objectId = BaseController.get_category_name(
                request, objectCategoryName)
            retListId, retListVal = some_data_deal_func(
            ).inputCategoryIdReturnName(objectCategoryId, repoId, createId)
            tmpLableList.append(objectId)
            constructWordList = []
            tmpSet = self.hanlp_tool.added_word_list
            for word in retListVal:
                if (word == None):
                    continue
                tmpDict = {}
                tmpDict['word'] = word
                tmpDict['mask'] = str(objectId)
                constructWordList.append(tmpDict)
            self.hanlp_tool.add_word_list(constructWordList)
        # Each entry: [event category, subject label, trigger label, (object label)].
        eventLabelList.append(tmpLableList)
    # The attribute used to look nodes up in neo4j (name attribute).
    # NOTE(review): category_id=1 is hard-coded here — confirm this is the
    # intended "name" attribute for every repo.
    attribute = TAttribute.objects.get(category_id=1)
    attributeDict = model_to_dict(attribute)
    attributeName = attributeDict['attribute_name']
    cnt = 1
    # Phase 2: run template matching over every document.
    for i in ret_entity_map:
        _id = i['_id']
        value = i['value']
        basetime = str(value['时间'])
        content = value['内容']
        text = HanlpUnit().get_text_from_html(content)
        sentenceList = self.hanlp_tool.split_paragraph(text)
        event_extract_result = []
        count = 0
        # Running token offset of the current sentence within the document.
        countIndex = 0
        # Dedup: one record per distinct event string per document.
        tmpEventSet = set()
        for sent in sentenceList:
            sent = sent.strip()
            sentenceDealResult = self.hanlp_tool.cut(sent)
            # Match all event templates against this sentence.
            event = self.eventExtractionByTemplateMatching(
                sent, eventLabelList)
            # Resolve the event time relative to the document's base time.
            dateTime = basetime
            timeIndex = -1
            timeIndex, timeWord, dateTime = Time_deal().dealTime(
                sent, basetime)
            if (timeIndex != -1):
                timeIndex = timeIndex + countIndex
            # Pick the longest place mention in the sentence as the location.
            locationList = Time_deal().dealArea(sent)
            location = ''
            locationindex = -1
            for val in locationList:
                if (len(val['place']) > len(location)):
                    location = val['place']
                    locationindex = val['index'] + countIndex
            countIndex += len(sentenceDealResult)
            for eve in event:
                # eve layout: [ruleIndex, subject, trigger, (object)];
                # a 3-element match means a subject-verb (rule 2) event.
                ruleId = 1
                if (len(eve) == 3):
                    ruleId = 2
                eveId = eve[0]
                subjectLabel = eventLabelList[eveId][1]
                # Event node attributes: time, place, and the concatenated
                # event string as its name.
                attribute = {}
                attribute['发生时间'] = dateTime
                attribute['地点'] = location
                eveString = ''
                for j in range(1, len(eve), 1):
                    eveString = eveString + str(eve[j])
                attribute['名字'] = eveString
                eventLabel = BaseController.get_category_name(
                    request, eventLabelList[eveId][0])
                subjectLabel = eventLabelList[eveId][1]
                # Create the event node, then link subject -> event.
                Neo4j().create_node_mjy_edition(eventLabel, attribute)
                subjectNameVal = eve[1]
                neo4jSubjectId = Neo4j().quesIdByLabelAttribute(
                    subjectLabel, attributeName,
                    '\'' + subjectNameVal + '\'')
                neo4jEventId = Neo4j().quesIdByLabelAttribute(
                    eventLabel, '名字', '\'' + eveString + '\'')
                Neo4j().createRelationship(subjectLabel, eventLabel,
                                           "主谓关系", {'id': neo4jSubjectId},
                                           {'id': neo4jEventId})
                if (ruleId == 1):
                    # SVO event: also link event -> object.
                    objectNameVal = eve[3]
                    objectLabel = eventLabelList[eveId][3]
                    neo4jObjectId = Neo4j().quesIdByLabelAttribute(
                        objectLabel, attributeName,
                        '\'' + objectNameVal + '\'')
                    Neo4j().createRelationship(eventLabel, objectLabel,
                                               "动宾关系",
                                               {'id': neo4jEventId},
                                               {'id': neo4jObjectId})
                # Flat record of this extraction for mongodb.
                tmpEventDict = {}
                tmpEventDict['actual_event_time'] = dateTime
                tmpEventDict['time'] = timeWord
                tmpEventDict['timeIndex'] = timeIndex
                tmpEventDict['location'] = location
                tmpEventDict['locationIndex'] = locationindex
                tmpEventDict['eventSubject'] = eve[1]
                tmpEventDict['eventSubjectLabel'] = subjectLabel
                tmpEventDict['triggerLabel'] = eventLabel
                tmpEventDict['triggerWord'] = eve[2]
                tmpEventDict['eventName'] = eveString
                if (ruleId == 1):
                    tmpEventDict['eventObject'] = eve[3]
                    objectLabel = eventLabelList[eveId][3]
                    tmpEventDict['eventObjectLabel'] = objectLabel
                if (eveString not in tmpEventSet):
                    tmpEventSet.add(eveString)
                    event_extract_result.append(tmpEventDict)
                    print(tmpEventDict)
                    count += 1
        # Persist this document's extraction results back onto the document.
        news_col.update_one(
            {'_id': _id},
            {"$set": {
                'event_extract_result': event_extract_result
            }})
        cnt += 1
    return True
def extract_relationship_from_unstructured_data(
        self, request, file_id, relationship_attribute_list=None):
    """
    从非结构化数据中抽取关系
    Extract relationships from unstructured text via template matching.

    Builds one template per relationship attribute ([attribute name,
    from-category label, relationship label, to-category label]), registers
    the relevant vocabularies with the segmenter, then scans each document's
    sentences; each match is stored in the document's
    'relationship_extract_result' and, when both endpoint nodes uniquely
    exist in neo4j, a relationship edge is created between them.

    :param file_id: 文件id,获取mongodb中对应要分析的数据
    :param relationship_attribute_list: 关系属性列表,所有使用该算法的关系属性id集合
    :param request:
    :return:
    """
    print("------------------------非结构关系抽取")
    tmp_info = {
        'file_id': file_id,
        'user_id': request.session["user_id"],
        'repo_id': request.session["repo_id"]
    }
    collection = Mongodb(db='knowledge', collection='text').get_collection()
    ret_entity = collection.find(tmp_info)
    # Keep only documents that carry text content ("内容").
    ret_entity_map = list()
    for item in ret_entity:
        if "内容" in item["value"]:
            ret_entity_map.append(item)
    if len(ret_entity_map) == 0 or relationship_attribute_list is None:
        print("无可抽取内容")
        return
    relationship_list = []
    # Tracks categories whose vocabulary was already fed to the segmenter.
    added_category_id = set()
    for attribute_id in relationship_attribute_list:
        cur_attribute = TAttribute.objects.get(id=attribute_id)
        # The attribute's own category is the relationship's source side.
        category_from = TCategory.objects.get(id=cur_attribute.category_id)
        # NOTE(review): passing the attribute object as `id=` looks wrong —
        # presumably this should resolve the attribute's data-type id; verify.
        data_type = TDataType.objects.get(id=cur_attribute)
        # The data type's category is the relationship's target side.
        category_to = TCategory.objects.get(id=data_type.category_id)
        category_from_name = BaseController.get_category_name(
            request, category_from.category_name)
        category_to_name = BaseController.get_category_name(
            request, category_to.category_name)
        # Template: [attribute name, from-label, relationship label, to-label].
        one_relationship = list()
        one_relationship.append(cur_attribute.attribute_name)
        one_relationship.append(category_from_name)
        one_relationship.append(
            BaseController.get_category_name(request,
                                             cur_attribute.attribute_name))
        one_relationship.append(category_to_name)
        relationship_list.append(one_relationship)
        # Register every alias of the attribute, masked with its label.
        self.hanlp_tool.add_word_list([{
            "word": alia_item.attribute_alias,
            "mask": BaseController.get_category_name(request,
                                                     cur_attribute.attribute_name)
        } for alia_item in TAttrbuteAlias.objects.filter(
            attribute_id=cur_attribute.id)])
        # Register the source category's entity names once.
        if category_from.id not in added_category_id:
            ret_list_id, ret_list_val = some_data_deal_func(
            ).inputCategoryIdReturnName(
                categoryId=category_from.id,
                repoId=request.session["repo_id"],
                createId=request.session["user_id"])
            self.hanlp_tool.add_word_list([{
                "word": val_item,
                "mask": category_from_name
            } for val_item in ret_list_val])
            added_category_id.add(category_from.id)
        # Register the target category's entity names once.
        if category_to.id not in added_category_id:
            ret_list_id, ret_list_val = some_data_deal_func(
            ).inputCategoryIdReturnName(
                categoryId=category_to.id,
                repoId=request.session["repo_id"],
                createId=request.session["user_id"])
            self.hanlp_tool.add_word_list([{
                "word": val_item,
                "mask": category_to_name
            } for val_item in ret_list_val])
            added_category_id.add(category_to.id)
    neo4j = Neo4j()
    cout = 0
    for i in ret_entity_map:
        _id = i['_id']
        value = i['value']
        content = value['内容']
        text = HanlpUnit().get_text_from_html(content)
        sentenceList = self.hanlp_tool.split_paragraph(text)
        extract_relationship = []
        for sent in sentenceList:
            sent = sent.strip()
            # Reuses the event template matcher; each match is
            # [templateIndex, from-name, relationship-word, to-name].
            relationships = self.eventExtractionByTemplateMatching(
                sent, relationship_list)
            for item in relationships:
                relation_id = item[0]
                cur_relationship = relationship_list[relation_id]
                extract_relationship.append({
                    "object_from_category": cur_relationship[1],
                    "object_to_category": cur_relationship[3],
                    "object_from_name": item[1],
                    "object_relationship_name": item[2],
                    "object_to_name": item[3]
                })
                # Only create the edge when both endpoints resolve to exactly
                # one existing node each.
                object1 = neo4j.match(
                    object_from={
                        "label_name": cur_relationship[1],
                        "content": {
                            "名字": item[1]
                        }
                    })
                object2 = neo4j.match(
                    object_from={
                        "label_name": cur_relationship[3],
                        "content": {
                            "名字": item[3]
                        }
                    })
                if object1 is not None and len(
                        object1) == 1 and object2 is not None and len(
                            object2) == 1:
                    neo4j.createRelationship(labelOne=cur_relationship[1],
                                             labelTwo=cur_relationship[3],
                                             relationShipName=item[2],
                                             propertyOne={"名字": item[1]},
                                             propertyTwo={"名字": item[3]})
        # Merge with any results from a previous run on this document.
        if "relationship_extract_result" in i:
            extract_relationship = self.merge_list(
                extract_relationship, i["relationship_extract_result"])
        cout += 1
        print(
            str(cout) + "个文章" + ",抽取数量:" + str(len(extract_relationship)))
        collection.update_one({"_id": ObjectId(_id)}, {
            "$set": {
                "relationship_extract_result": extract_relationship
            }
        })