Ejemplo n.º 1
0
    def return_data_from_mongodb(self, request):
        """Render ``test1.html`` with one file's documents and the repo's category names.

        Reads ``repo_id`` from the session and ``file_id`` from the POST data,
        loads every document of the ``knowledge.text`` collection whose
        ``file_id`` matches, and collects the ``category_name`` of every
        ``TCategory`` row belonging to the repository.

        :param request: request object exposing ``session`` and ``POST`` mappings
        :return: ``self.error(...)`` when an id is missing or the collection
            cannot be obtained, otherwise the rendered ``test1.html`` response
        """
        # Both ids are mandatory; bail out with an error response when absent.
        try:
            repo_id = request.session['repo_id']
        except Exception:
            return self.error('没有知识库id')
        try:
            file_id = request.POST['file_id']
        except Exception:
            return self.error('没有文件id')

        try:
            collection = Mongodb(db='knowledge', collection='text').get_collection()
        except Exception:
            return self.error("mongodb没有数据库或者表")

        # Materialise the cursor so the template receives a plain list.
        documents = list(collection.find({'file_id': file_id}))
        category_names = [
            model_to_dict(category)['category_name']
            for category in TCategory.objects.filter(repo_id=repo_id)
        ]

        payload = {'category_name': category_names, 'context': documents}
        print(payload)
        return render(request, 'test1.html', context=payload)
Ejemplo n.º 2
0
    def save_mongodb_data_to_neo4j(self, request):
        """Copy one MongoDB document into Neo4j as a node labelled with a category name.

        Looks up the document by ``_id`` in the ``knowledge.text`` collection
        and passes it, together with the category's ``category_name``, to
        ``Neo4j().create_node_mjy_edition``. Finally renders ``test1.html``
        with an empty context.

        :param request: request object; only forwarded to ``render``
        :return: ``self.error(...)`` on failure, otherwise the rendered response
        """
        # NOTE(review): both ids are hard-coded; reading them from
        # request.POST ('entity_id' / 'category_id') looks like the intended
        # behaviour but is currently disabled - confirm before relying on this view.
        try:
            entity_id = ObjectId("5eb52fc9d03fe5b0f31b6f40")
        except Exception:
            return self.error("没有收到entity_id")
        try:
            category_id = 1
        except Exception:
            return self.error("没有收到category_id")

        try:
            collection = Mongodb(db='knowledge', collection='text').get_collection()
        except Exception:
            return self.error("mongodb没有数据库或者表")

        category = TCategory.objects.get(id=category_id)
        category_name = model_to_dict(category)['category_name']

        for document in collection.find({'_id': entity_id}):
            print(category_name, document, 1)
            Neo4j().create_node_mjy_edition(category_name, document)

        return render(request, 'test1.html', context={})
Ejemplo n.º 3
0
 def get_data_source():
     """
     Collect the URLs of the film people that have already been fetched.

     Reads every document of the ``member`` collection in the ``movies``
     database and gathers its ``douban_url`` value.

     :return: set of ``douban_url`` values
     """
     members = Mongodb(db='movies', collection='member').get_collection()
     return {member["douban_url"] for member in members.find()}
Ejemplo n.º 4
0
 def GetStatistics(spider_id, repo_id):
     """Summarise how many documents a spider has stored for a repository.

     Counts documents of the ``knowledge.text`` collection matching
     ``spider_id`` and ``repo_id``: in total, crawled today, crawled within
     the last 7 days and crawled within the last 30 days. The time windows
     are evaluated against the ``value.crawl_time`` field.

     :param spider_id: value matched against the ``spider_id`` field
     :param repo_id: value matched against the ``repo_id`` field
     :return: formatted summary string with the four counts
     """
     collection = Mongodb(db='knowledge',
                          collection='text').get_collection()
     base_filter = {"spider_id": spider_id, "repo_id": repo_id}

     def count_matching(crawl_time_condition=None):
         # Cursor.count() is deprecated (removed in PyMongo 4);
         # count_documents is its supported replacement.
         query = dict(base_filter)
         if crawl_time_condition is not None:
             query["value.crawl_time"] = crawl_time_condition
         return collection.count_documents(query)

     # Read the clock once so all windows are relative to the same instant.
     now = time.time()
     timestamp_format = '%Y-%m-%d %H:%M:%S'
     curr_date = time.strftime('%Y-%m-%d', time.localtime(now))
     week_start = time.strftime(timestamp_format,
                                time.localtime(now - 7 * 24 * 3600))
     month_start = time.strftime(timestamp_format,
                                 time.localtime(now - 30 * 24 * 3600))

     count = count_matching()
     # "Today" is matched by date substring; the week/month windows use a
     # string comparison, so this presumably relies on crawl_time being
     # stored as a '%Y-%m-%d %H:%M:%S' string - TODO confirm.
     count_today = count_matching({'$regex': curr_date})
     count_week = count_matching({'$gt': week_start})
     count_month = count_matching({'$gt': month_start})

     result = '数据:%6s条 今日:%6s条 本周:%6s条 本月:%s条' % (count, count_today,
                                                  count_week, count_month)
     return result
Ejemplo n.º 5
0
from model.mongodb import Mongodb
from bson import ObjectId

if __name__ == '__main__':
    # Ad-hoc check: print the _id of every document in test.test that has
    # file_id == 1 and contains a "value.test1" field.
    collection = Mongodb(db='test', collection='test').get_collection()
    query = {"file_id": 1, "value.test1": {"$exists": True}}
    for document in collection.find(query):
        print(document["_id"])
Ejemplo n.º 6
0
    def update_t_mapping_rule(self, repo_id, create_id):
        """Refresh the ``TMappingRule`` rows of every category of a user in a repository.

        For each ``TCategory`` matching ``repo_id`` / ``create_id``:

        1. build the map of known attribute names for the category, its parent
           and its grandparent via ``self.return_attribute_name_map``;
        2. count how often each top-level key (other than ``_id``, ``file_id``
           and ``category_id``) occurs in the ``knowledge.text`` documents of
           that category;
        3. delete the category's mapping rules whose attribute name no longer
           occurs in those documents;
        4. for every counted key that is not in the known-attribute map,
           create a rule or update the existing one with the key's coverage
           rate (occurrences / number of documents).

        :param repo_id: repository id used to select the categories
        :param create_id: creator id used to select categories and rules
        :return: always ``1``
        """
        owned_categories = TCategory.objects.filter(repo_id=repo_id,
                                                    create_id=create_id)
        category_ids = [model_to_dict(entry)['id'] for entry in owned_categories]

        bookkeeping_keys = ('_id', 'file_id', 'category_id')
        for category_id in category_ids:
            # Known attribute names: the category's own attributes first ...
            own_attributes = TAttribute.objects.filter(category_id=category_id)
            known_names = self.return_attribute_name_map(own_attributes, {})

            # ... then the parent's and the grandparent's, when present.
            current = TCategory.objects.get(id=category_id)
            parent_id = model_to_dict(current)['father_category_id']
            if str(-1) != parent_id:
                parent_attributes = TAttribute.objects.filter(
                    category_id=parent_id)
                known_names = self.return_attribute_name_map(
                    parent_attributes, known_names)
                parent = TCategory.objects.get(id=parent_id)
                grandparent_id = model_to_dict(parent)['father_category_id']
                if str(-1) != grandparent_id:
                    grandparent_attributes = TAttribute.objects.filter(
                        category_id=grandparent_id)
                    known_names = self.return_attribute_name_map(
                        grandparent_attributes, known_names)

            news_col = Mongodb(db='knowledge',
                               collection='text').get_collection()

            # Count how many documents of this category carry each key.
            key_hits = {}
            document_total = 0
            for document in news_col.find({'category_id': category_id}):
                document_total += 1
                if document is None:
                    continue
                for key in document.keys():
                    if key in bookkeeping_keys:
                        continue
                    key_hits[key] = key_hits.get(key, 0) + 1

            # Drop rules whose attribute no longer appears in any document.
            # Ids are collected first and deleted afterwards.
            stale_rule_ids = []
            existing_rules = TMappingRule.objects.filter(
                category_id=category_id, create_id=create_id)
            for rule in existing_rules:
                rule_fields = model_to_dict(rule)
                print(rule_fields)
                if rule_fields['attribute_name'] not in key_hits.keys():
                    stale_rule_ids.append(rule_fields['id'])
            for stale_id in stale_rule_ids:
                TMappingRule.objects.get(id=stale_id).delete()

            # Keys that are not yet known attributes become mapping rules.
            pending_rules = []
            for key, hits in key_hits.items():
                if key not in known_names:
                    pending_rules.append((key, 1.0 * hits / document_total))

            stamp = datetime.now()
            for attribute_name, coverage in pending_rules:
                existing = TMappingRule.objects.filter(
                    attribute_name=attribute_name,
                    create_id=create_id).first()
                if existing is None:
                    TMappingRule.objects.create(
                        attribute_name=attribute_name,
                        coverage_rate=coverage,
                        create_time=str(stamp)[:19],
                        category_id=category_id,
                        create_id=create_id)
                else:
                    existing.coverage_rate = coverage
                    existing.create_time = str(stamp)[:19]
                    existing.save()
        return 1
Ejemplo n.º 7
0
class MaoyanSpider(Driver):
    """Browser-driven spider for the Maoyan box-office dashboard.

    Scrapes the per-day box-office table from
    ``piaofang.maoyan.com/dashboard/movie`` and stores the rows in the
    ``knowledge.text`` MongoDB collection. The page-handling helpers used
    here (``fast_new_page``, ``close_curr_page``, the ``until_presence_*`` and
    ``judge_*`` methods) are not defined in this class; presumably they come
    from ``Driver``.
    """

    # Dashboard URL prefix; the date (YYYY-MM-DD) is appended to it.
    _DASHBOARD_URL = "http://piaofang.maoyan.com/dashboard/movie?date="

    def __init__(self,
                 isheadless=False,
                 ismobile=False,
                 isvirtualdisplay=False,
                 isloadimages=True,
                 isproxy=False,
                 spider_id='2'):
        """Initialise the driver and open the MongoDB collections.

        All flags are forwarded unchanged to ``Driver.__init__``;
        ``spider_id`` is passed as its ``log_file_name``.
        """
        Driver.__init__(self,
                        log_file_name=spider_id,
                        ismobile=ismobile,
                        isvirtualdisplay=isvirtualdisplay,
                        isheadless=isheadless,
                        isloadimages=isloadimages,
                        isproxy=isproxy)
        self.boxoffice_col = Mongodb(db='knowledge',
                                     collection='text').get_collection()
        self.news_col = Mongodb(db='movies1',
                                collection='news').get_collection()

    @staticmethod
    def find_key_from_value(dict, value):
        """Return the first key of ``dict`` whose value equals ``value``, else ``None``.

        The parameter name shadows the builtin; it is kept so that existing
        keyword callers continue to work.
        """
        for key, candidate in dict.items():
            if value == candidate:
                return key
        return None

    def get_boxoffice_infos_from_one_page(self,
                                          url="",
                                          datetime="",
                                          user_id=-1,
                                          repo_id=-1):
        """
        Scrape the box-office table shown on one dashboard page.

        :param url: dashboard page to open
        :param datetime: date string stored with every row under "日期"
        :param user_id: user id used in the duplicate check
        :param repo_id: repository id used in the duplicate check
        :return: ``(rows, keep_going)``. ``rows`` is the list of row dicts
            accepted by ``judge_data_exist_by_keys``; ``keep_going`` is
            ``False`` when the page has no table rows or a row was rejected
            by that check, ``True`` otherwise (including when the dashboard
            container is missing). Every exit path returns this tuple, so
            callers can always unpack two values.
        """
        rows_selector = (
            "div.movielist-container > div.movielist > "
            "table.dashboard-table > tbody > tr")

        self.fast_new_page(url=url)
        time.sleep(1)
        if not self.judge_web_element_exist_by_css_selector(
                css_selector="div.dashboard-content"):
            self.close_curr_page()
            return [], True
        header_cells = self.until_presence_of_all_elements_located_by_css_selector(
            css_selector=
            "div.dashboard-list > table.dashboard-table.table-header > thead > tr > th"
        )[1:]
        theads = [cell.text for cell in header_cells]
        if not self.judge_web_element_exist_by_css_selector(
                css_selector=rows_selector):
            self.close_curr_page()
            return [], False
        boxoffice_infos = self.until_presence_of_all_elements_located_by_css_selector(
            css_selector=rows_selector)
        crawl_time = time.strftime('%Y-%m-%d %H:%M:%S',
                                   time.localtime(time.time()))
        boxoffice_data_from_the_page = []
        for row in boxoffice_infos:
            cells = self.until_presence_of_all_elements_located_by_css_selector(
                css_selector="td", ele=row)
            movie_name = self.until_presence_of_element_located_by_css_selector(
                css_selector="div > div.moviename-desc > p.moviename-name",
                ele=cells[0])
            movie_info = self.until_presence_of_all_elements_located_by_css_selector(
                css_selector=
                "div > div.moviename-desc > p.moviename-info > span",
                ele=cells[0])
            one_boxoffice_data = {
                "日期": datetime,
                "电影名": movie_name.text,
                "上映时间": movie_info[0].text,
                "总票房": movie_info[1].text,
            }
            # setdefault keeps the fixed fields above if a column header
            # happens to use the same name. Cells beyond the known headers
            # are ignored.
            for header, cell in zip(theads, cells[1:]):
                one_boxoffice_data.setdefault(header, cell.text)
            one_boxoffice_data.setdefault("crawl_time", crawl_time)
            one_boxoffice_data.setdefault("crawl_from", "猫眼专业版")
            judge_result = self.judge_data_exist_by_keys(
                collection=self.boxoffice_col,
                keys={
                    "user_id": user_id,
                    "repo_id": repo_id,
                    "value.日期": one_boxoffice_data["日期"],
                    "value.电影名": one_boxoffice_data["电影名"],
                    "value.crawl_from": one_boxoffice_data["crawl_from"]
                })
            if judge_result is not True:
                # Close the page on this exit as well, so it is not left open.
                self.close_curr_page()
                return boxoffice_data_from_the_page, False
            boxoffice_data_from_the_page.append(one_boxoffice_data)

        self.close_curr_page()
        return boxoffice_data_from_the_page, True

    def get_boxoffice_infos(self, spider_id, user_id, repo_id, spider_name):
        """Crawl dashboard pages backwards in time and store the collected rows.

        Starts at the fixed date 2020-01-23 and moves one day back per page
        until ``get_boxoffice_infos_from_one_page`` reports ``False``. When
        at least one row was collected, a ``TDataAcquisitionLog`` and a
        ``TEntityExtractionLog`` record are created and every row is inserted
        into the box-office collection under the new log's id.

        NOTE(review): the loop ends only when a page reports ``False``; a
        site that keeps serving pages without the dashboard container would
        keep it running - confirm whether a bound is needed.

        :param spider_id: stored (as int) with every inserted row
        :param user_id: owner of the data (converted to int)
        :param repo_id: target repository (converted to int)
        :param spider_name: recorded as the log's ``data_source_name``
        :return: ``None``
        """
        date = datetime.datetime.strptime("2020-01-23", '%Y-%m-%d')
        final_result = []
        while True:
            day = str(date)[:10]
            data_list, result = self.get_boxoffice_infos_from_one_page(
                url=self._DASHBOARD_URL + day,
                datetime=day,
                user_id=int(user_id),
                repo_id=int(repo_id))
            final_result.extend(data_list)
            if result is False:
                break
            date = date + datetime.timedelta(days=-1)
        if not final_result:
            return
        one_data_acquisition_log = TDataAcquisitionLog.objects.create(
            create_time=timezone.now(),
            data_source_name=spider_name,
            data_access="爬虫",
            repo_id=int(repo_id),
            create_id=int(user_id),
            data_path="")
        TEntityExtractionLog.objects.create(
            data_acquisition_id=one_data_acquisition_log.id,
            is_extract=0,
            entity_number=0,
            extract_time=timezone.now(),
            create_id=int(user_id),
            repo_id=int(repo_id))

        for item in final_result:
            self.boxoffice_col.insert_one({
                "file_id": one_data_acquisition_log.id,
                "category_id": -1,
                "spider_id": int(spider_id),
                "user_id": int(user_id),
                "repo_id": int(repo_id),
                "value": item
            })

    def run_spider(self, url=""):
        """Visit the dashboard page of every day after the newest stored record.

        The start date is taken from the ``datetime`` field of the document
        that sorts last by that field; pages are then opened day by day up to
        now. The scraped rows are not stored by this method, and ``url`` is
        not used.

        NOTE(review): documents written by ``get_boxoffice_infos`` have no
        top-level ``datetime`` field, and an empty collection makes the
        ``[0]`` lookup fail - confirm this method is still in use.
        """
        lastest_info = self.boxoffice_col.find().sort("datetime", -1).limit(1)
        date = datetime.datetime.strptime(lastest_info[0]["datetime"],
                                          '%Y-%m-%d')
        date = date + datetime.timedelta(days=1)
        now = datetime.datetime.now()
        while date < now:
            day = str(date)[:10]
            self.get_boxoffice_infos_from_one_page(
                self._DASHBOARD_URL + day, day)
            date = date + datetime.timedelta(days=1)
Ejemplo n.º 8
0
    def eventExtraction(self, request, file_id, lEventCategoryId):
        """
        Template-matching event extraction over the stored documents of one file.

        Phase 1 walks every ``TTriggerWord`` of the session's repository,
        registers the trigger word and the subject/object vocabulary with
        ``self.hanlp_tool`` and builds ``eventLabelList``. Phase 2 splits the
        "内容" text of each matching MongoDB document into sentences, extracts
        events with ``self.eventExtractionByTemplateMatching``, writes event
        nodes and "主谓关系" / "动宾关系" relationships to Neo4j, and stores the
        de-duplicated events on the document as ``event_extract_result``.

        :param request:             request object; ``session['repo_id']`` and
                                    ``session['user_id']`` are read
        :param file_id:             file id (documented as str) selecting the
                                    MongoDB documents
        :param lEventCategoryId:    list of event category ids; trigger words
                                    whose rule belongs to another category are
                                    skipped
        :return: True after all documents were processed; None when no
                 document of the file has "内容" in its "value"
        """
        # Rule id convention used below:
        #   1 -> the event is a subject / trigger / object triple
        #   2 -> the event is subject / trigger only (no object)
        repoId = request.session['repo_id']
        createId = request.session['user_id']

        # Documents are looked up (and later updated) by file, user and repo.
        tmp_info = {'file_id': file_id, 'user_id': createId, 'repo_id': repoId}
        news_col = Mongodb(db='knowledge', collection='text').get_collection()
        cnt = 1
        ret_entity = news_col.find(tmp_info)
        ret_entity_map = list()
        # Only documents that carry a "内容" entry in "value" are processed.
        for item in ret_entity:
            if "内容" in item["value"]:
                ret_entity_map.append(item)

        if len(ret_entity_map) == 0:
            return

        print("--------------------事件抽取")
        # Before extracting, register every relevant word with the tokenizer:
        # this loop adds all trigger words of the repository together with the
        # words of their event subject and object categories.
        retTriggerWordList = TTriggerWord.objects.filter(repo_id=repoId)
        eventLabelList = []
        # TODO (from the original author): this should be fixed to start the
        # lookup from the event categories instead of the trigger words.
        for i in retTriggerWordList:
            tmpLableList = []
            ruleId = 1
            retTriggerWordDict = model_to_dict(i)
            triggerId = retTriggerWordDict['id']
            eventId = retTriggerWordDict['event_rule_id']
            # Trigger word text and the label used to tag it.
            retEventRule = TEventRule.objects.get(id=eventId)
            retCategoryName = TCategory.objects.get(
                id=retEventRule.category_id).category_name
            # Original note: the trigger word's label should become the
            # event's label here; to be changed later.
            triggerWord = retTriggerWordDict['trigger_word']
            triggerWordId = BaseController.get_category_name(
                request, retCategoryName)

            eventRule = TEventRule.objects.get(id=eventId, repo_id=repoId)
            eventRuleDict = model_to_dict(eventRule)
            eventCategoryId = eventRuleDict['category_id']
            # Skip trigger words whose event category was not requested.
            if (eventCategoryId not in lEventCategoryId):
                continue
            eventCategory = TCategory.objects.get(id=eventCategoryId,
                                                  repo_id=repoId,
                                                  create_id=createId)
            eventCategoryDict = model_to_dict(eventCategory)
            eventCategoryName = eventCategoryDict['category_name']
            # Index 0 of the label entry: event category name.
            tmpLableList.append(eventCategoryName)

            subjectCategoryId = eventRuleDict['event_subject_id']
            subjectCategory = TCategory.objects.get(id=subjectCategoryId,
                                                    repo_id=repoId,
                                                    create_id=createId)
            subjectCategoryDict = model_to_dict(subjectCategory)
            subjectCategoryName = subjectCategoryDict['category_name']
            subjectId = BaseController.get_category_name(
                request, subjectCategoryName)
            # Index 1 of the label entry: subject label.
            tmpLableList.append(subjectId)
            retListId, retListVal = some_data_deal_func(
            ).inputCategoryIdReturnName(subjectCategoryId, repoId, createId)
            # Every value in retListVal is added to the tokenizer: build a
            # list of {'word': ..., 'mask': ...} entries tagged with the
            # subject label.
            constructWordList = []
            tmpSet = self.hanlp_tool.added_word_list
            for word in retListVal:
                if (word == None):
                    continue
                tmpDict = {}
                tmpDict['word'] = word
                tmpDict['mask'] = subjectId
                constructWordList.append(tmpDict)

            # add_word_list expects a list like [{'word': ..., 'mask': ...}].
            self.hanlp_tool.add_word_list(constructWordList)
            objectCategoryId = eventRuleDict['event_object_id']
            # An object category id of -1 means the rule has no object.
            negativeOne = -1
            if (objectCategoryId == negativeOne):
                ruleId = 2

            # Register the trigger word itself with the tokenizer.
            constructWordList = []
            tmpDict = {}
            tmpDict['word'] = triggerWord
            tmpDict['mask'] = str(triggerWordId)
            tmpSet = self.hanlp_tool.added_word_list
            constructWordList.append(tmpDict)
            self.hanlp_tool.add_word_list(constructWordList)
            # Index 2 of the label entry: trigger label.
            tmpLableList.append(str(triggerWordId))
            print(ruleId)
            if (ruleId == 1):
                objectCategoryId = eventRuleDict['event_object_id']
                objectCategory = TCategory.objects.get(id=objectCategoryId,
                                                       repo_id=repoId,
                                                       create_id=createId)
                objectCategoryDict = model_to_dict(objectCategory)
                objectCategoryName = objectCategoryDict['category_name']
                objectId = BaseController.get_category_name(
                    request, objectCategoryName)
                retListId, retListVal = some_data_deal_func(
                ).inputCategoryIdReturnName(objectCategoryId, repoId, createId)
                # Index 3 of the label entry (rule 1 only): object label.
                tmpLableList.append(objectId)
                constructWordList = []
                tmpSet = self.hanlp_tool.added_word_list
                # Original note: this code has changed and needs revisiting.
                for word in retListVal:
                    if (word == None):
                        continue
                    tmpDict = {}
                    tmpDict['word'] = word
                    tmpDict['mask'] = str(objectId)
                    constructWordList.append(tmpDict)
                # Same [{'word': ..., 'mask': ...}] shape as above.
                self.hanlp_tool.add_word_list(constructWordList)

            eventLabelList.append(tmpLableList)

        # Each eventLabelList entry is
        # [event category name, subject label, trigger label, object label];
        # the object label is only present for rule 1.

        # Attribute name used to look up subject/object nodes in Neo4j.
        # NOTE(review): taken from the attribute of category_id=1, which is
        # hard-coded here - confirm this is intended.
        attribute = TAttribute.objects.get(category_id=1)
        attributeDict = model_to_dict(attribute)
        attributeName = attributeDict['attribute_name']
        cnt = 1
        for i in ret_entity_map:
            # The _id is kept so the result can be written back to this document.
            _id = i['_id']
            value = i['value']
            basetime = str(value['时间'])
            content = value['内容']
            text = HanlpUnit().get_text_from_html(content)
            sentenceList = self.hanlp_tool.split_paragraph(text)
            # Collected per document: time, location, event subject and
            # object, and the labels of subject and object.
            event_extract_result = []
            count = 0
            # Running offset added to per-sentence time/location indexes.
            countIndex = 0
            # Event names already recorded for this document (de-duplication).
            tmpEventSet = set()
            for sent in sentenceList:
                sent = sent.strip()
                # Tokenize each sentence and extract its events.
                sentenceDealResult = self.hanlp_tool.cut(sent)
                event = self.eventExtractionByTemplateMatching(
                    sent, eventLabelList)
                # Original note: dateTime still needs adjusting; basetime can
                # cause problems.

                dateTime = basetime
                timeIndex = -1
                timeIndex, timeWord, dateTime = Time_deal().dealTime(
                    sent, basetime)
                # -1 is kept as-is; any other index is shifted by the offset.
                if (timeIndex != -1):
                    timeIndex = timeIndex + countIndex

                # Keep the longest place name found in the sentence.
                locationList = Time_deal().dealArea(sent)
                location = ''
                locationindex = -1
                for val in locationList:
                    if (len(val['place']) > len(location)):
                        location = val['place']
                        locationindex = val['index'] + countIndex
                countIndex += len(sentenceDealResult)

                # Time, location and their indexes are returned with each event.
                for eve in event:
                    # An event of length 3 has no object (rule 2).
                    ruleId = 1
                    if (len(eve) == 3):
                        ruleId = 2
                    # eve[0] indexes into eventLabelList.
                    eveId = eve[0]
                    subjectLabel = eventLabelList[eveId][1]

                    attribute = {}
                    attribute['发生时间'] = dateTime
                    attribute['地点'] = location
                    eveString = ''

                    # The event name is the concatenation of eve[1:].
                    for j in range(1, len(eve), 1):
                        eveString = eveString + str(eve[j])
                    attribute['名字'] = eveString
                    # The event label is obtained through a lookup.
                    eventLabel = BaseController.get_category_name(
                        request, eventLabelList[eveId][0])
                    subjectLabel = eventLabelList[eveId][1]

                    # Create the event node, then link subject -> event.
                    Neo4j().create_node_mjy_edition(eventLabel, attribute)
                    subjectNameVal = eve[1]
                    neo4jSubjectId = Neo4j().quesIdByLabelAttribute(
                        subjectLabel, attributeName,
                        '\'' + subjectNameVal + '\'')
                    neo4jEventId = Neo4j().quesIdByLabelAttribute(
                        eventLabel, '名字', '\'' + eveString + '\'')
                    Neo4j().createRelationship(subjectLabel, eventLabel,
                                               "主谓关系", {'id': neo4jSubjectId},
                                               {'id': neo4jEventId})
                    if (ruleId == 1):
                        # Rule 1 also links event -> object.
                        objectNameVal = eve[3]
                        objectLabel = eventLabelList[eveId][3]
                        neo4jObjectId = Neo4j().quesIdByLabelAttribute(
                            objectLabel, attributeName,
                            '\'' + objectNameVal + '\'')
                        Neo4j().createRelationship(eventLabel, objectLabel,
                                                   "动宾关系",
                                                   {'id': neo4jEventId},
                                                   {'id': neo4jObjectId})
                    tmpEventDict = {}
                    tmpEventDict['actual_event_time'] = dateTime
                    # Record the extracted pieces of the event.
                    tmpEventDict['time'] = timeWord
                    tmpEventDict['timeIndex'] = timeIndex
                    tmpEventDict['location'] = location
                    tmpEventDict['locationIndex'] = locationindex
                    tmpEventDict['eventSubject'] = eve[1]
                    tmpEventDict['eventSubjectLabel'] = subjectLabel
                    tmpEventDict['triggerLabel'] = eventLabel
                    tmpEventDict['triggerWord'] = eve[2]
                    tmpEventDict['eventName'] = eveString
                    if (ruleId == 1):
                        tmpEventDict['eventObject'] = eve[3]
                        objectLabel = eventLabelList[eveId][3]
                        tmpEventDict['eventObjectLabel'] = objectLabel
                    # Only the first occurrence of an event name is stored;
                    # the Neo4j writes above happen for every occurrence.
                    if (eveString not in tmpEventSet):
                        tmpEventSet.add(eveString)
                        event_extract_result.append(tmpEventDict)
                    print(tmpEventDict)
                    count += 1
            # Write the result back to the MongoDB document.
            news_col.update_one(
                {'_id': _id},
                {"$set": {
                    'event_extract_result': event_extract_result
                }})
            cnt += 1
        return True
Ejemplo n.º 9
0
    def extract_relationship_from_unstructured_data(
            self, request, file_id, relationship_attribute_list=None):
        """
        Extract relationships from unstructured text stored in MongoDB.

        Loads the documents of ``file_id`` that belong to the session's user
        and repository from the ``knowledge.text`` collection, keeps those
        whose ``value`` contains a "内容" entry, and runs template matching on
        every sentence of that content. The matches of each document are
        merged with any existing ``relationship_extract_result`` and written
        back to the document. A match is additionally created as a Neo4j
        relationship when ``neo4j.match`` returns exactly one result for each
        of its two end points.

        :param request: request whose session provides "user_id" and "repo_id"
        :param file_id: file id selecting the MongoDB documents to analyse
        :param relationship_attribute_list: ids of the relationship attributes
            (TAttribute rows) handled by this algorithm; nothing is extracted
            when it is None
        :return: None
        """
        print("------------------------非结构关系抽取")
        user_id = request.session["user_id"]
        repo_id = request.session["repo_id"]
        collection = Mongodb(db='knowledge',
                             collection='text').get_collection()
        ret_entity_map = []
        for item in collection.find({
                'file_id': file_id,
                'user_id': user_id,
                'repo_id': repo_id
        }):
            # Skip documents without a usable "value" mapping instead of
            # failing on the lookup.
            value = item.get("value")
            if isinstance(value, dict) and "内容" in value:
                ret_entity_map.append(item)

        if not ret_entity_map or relationship_attribute_list is None:
            print("无可抽取内容")
            return

        # Categories whose words were already handed to the tokenizer.
        added_category_id = set()

        def register_category_words(category, mask):
            # Register the values returned by inputCategoryIdReturnName for
            # this category with the tokenizer, at most once per category.
            if category.id in added_category_id:
                return
            _, ret_list_val = some_data_deal_func().inputCategoryIdReturnName(
                categoryId=category.id, repoId=repo_id, createId=user_id)
            self.hanlp_tool.add_word_list([{
                "word": val_item,
                "mask": mask
            } for val_item in ret_list_val])
            added_category_id.add(category.id)

        relationship_list = []
        for attribute_id in relationship_attribute_list:
            cur_attribute = TAttribute.objects.get(id=attribute_id)
            category_from = TCategory.objects.get(id=cur_attribute.category_id)
            # The data type is referenced through the attribute's
            # data_type_id; the attribute object itself is not a valid id.
            data_type = TDataType.objects.get(id=cur_attribute.data_type_id)
            category_to = TCategory.objects.get(id=data_type.category_id)

            category_from_name = BaseController.get_category_name(
                request, category_from.category_name)
            category_to_name = BaseController.get_category_name(
                request, category_to.category_name)
            # Computed once and shared by the template and all aliases.
            relationship_mask = BaseController.get_category_name(
                request, cur_attribute.attribute_name)

            # Template layout: [attribute name, source mask,
            #                   relationship mask, target mask]
            relationship_list.append([
                cur_attribute.attribute_name,
                category_from_name,
                relationship_mask,
                category_to_name,
            ])
            self.hanlp_tool.add_word_list([{
                "word": alia_item.attribute_alias,
                "mask": relationship_mask
            } for alia_item in TAttrbuteAlias.objects.filter(
                attribute_id=cur_attribute.id)])
            register_category_words(category_from, category_from_name)
            register_category_words(category_to, category_to_name)

        neo4j = Neo4j()
        for cout, doc in enumerate(ret_entity_map, start=1):
            _id = doc['_id']
            content = doc['value']['内容']
            text = HanlpUnit().get_text_from_html(content)

            extract_relationship = []
            for sent in self.hanlp_tool.split_paragraph(text):
                relationships = self.eventExtractionByTemplateMatching(
                    sent.strip(), relationship_list)
                for item in relationships:
                    # item[0] indexes relationship_list; item[1..3] are used
                    # below as source name, relationship name and target name.
                    cur_relationship = relationship_list[item[0]]
                    from_label = cur_relationship[1]
                    to_label = cur_relationship[3]

                    extract_relationship.append({
                        "object_from_category": from_label,
                        "object_to_category": to_label,
                        "object_from_name": item[1],
                        "object_relationship_name": item[2],
                        "object_to_name": item[3]
                    })
                    object1 = neo4j.match(
                        object_from={
                            "label_name": from_label,
                            "content": {
                                "名字": item[1]
                            }
                        })
                    object2 = neo4j.match(
                        object_from={
                            "label_name": to_label,
                            "content": {
                                "名字": item[3]
                            }
                        })
                    # Only link when both end points resolve unambiguously.
                    if (object1 is not None and len(object1) == 1
                            and object2 is not None and len(object2) == 1):
                        neo4j.createRelationship(labelOne=from_label,
                                                 labelTwo=to_label,
                                                 relationShipName=item[2],
                                                 propertyOne={"名字": item[1]},
                                                 propertyTwo={"名字": item[3]})
            # Keep results already stored on the document.
            if "relationship_extract_result" in doc:
                extract_relationship = self.merge_list(
                    extract_relationship, doc["relationship_extract_result"])
            print(
                str(cout) + "个文章" + ",抽取数量:" + str(len(extract_relationship)))
            collection.update_one({"_id": ObjectId(_id)}, {
                "$set": {
                    "relationship_extract_result": extract_relationship
                }
            })