def return_data_from_mongodb(self, request):
    try:
        repo_id = request.session['repo_id']  # repo_id = 1
    except Exception:
        return self.error('没有知识库id')
    try:
        file_id = request.POST['file_id']  # file_id = 1
    except Exception:
        return self.error('没有文件id')
    tmp_info = {'file_id': file_id}
    try:
        news_col = Mongodb(db='knowledge', collection='text').get_collection()
    except Exception:
        return self.error("mongodb没有数据库或者表")
    ret_entity_map = news_col.find(tmp_info)
    ret_list = []
    for val in ret_entity_map:
        ret_list.append(val)
    category_name_list = []
    ret_category = TCategory.objects.filter(repo_id=repo_id)
    for val in ret_category:
        val_dict = model_to_dict(val)
        category_name_list.append(val_dict['category_name'])
    ret_l = {'category_name': category_name_list, 'context': ret_list}
    print(ret_l)
    return render(request, 'test1.html', context=ret_l)
def save_mongodb_data_to_neo4j(self, request):
    try:
        # entity_id = request.POST['entity_id']
        entity_id = ObjectId("5eb52fc9d03fe5b0f31b6f40")
    except Exception:
        return self.error("没有收到entity_id")
    try:
        # category_id = request.POST['category_id']
        category_id = 1
    except Exception:
        return self.error("没有收到category_id")
    try:
        news_col = Mongodb(db='knowledge', collection='text').get_collection()
    except Exception:
        return self.error("mongodb没有数据库或者表")
    category_val = TCategory.objects.get(id=category_id)
    category_val_dict = model_to_dict(category_val)
    category_name = category_val_dict['category_name']
    tmp_data = {'_id': entity_id}
    ret_entity = news_col.find(tmp_data)
    for val in ret_entity:
        print(category_name, val, 1)
        Neo4j().create_node_mjy_edition(category_name, val)
    ret_l = {}
    return render(request, 'test1.html', context=ret_l)
def get_data_source():
    """
    Return the set of filmmaker douban urls that have already been crawled.
    :return:
    """
    member_col = Mongodb(db='movies', collection='member').get_collection()
    url_set = set()
    for item in member_col.find():
        url_set.add(item["douban_url"])
    return url_set
def __init__(self, isheadless=False, ismobile=False, isvirtualdisplay=False,
             isloadimages=True, isproxy=False, spider_id='2'):
    Driver.__init__(self, log_file_name=spider_id, ismobile=ismobile,
                    isvirtualdisplay=isvirtualdisplay, isheadless=isheadless,
                    isloadimages=isloadimages, isproxy=isproxy)
    self.collection = Mongodb(db='knowledge', collection='text').get_collection()
def __init__(self, isheadless=False, ismobile=False, isvirtualdisplay=False,
             isloadimages=True, isproxy=False, proxy_ip_from="", spider_id='2'):
    Driver.__init__(self, log_file_name=spider_id, ismobile=ismobile,
                    isvirtualdisplay=isvirtualdisplay, isheadless=isheadless,
                    isloadimages=isloadimages, isproxy=isproxy,
                    proxy_ip_from=proxy_ip_from)
    # self.baike_col = Mongodb(db='movies1', collection="baike_member").get_collection()
    self.baike_col = Mongodb(db='baike', collection="test1").get_collection()
def save_data_to_mongodb(self, request):
    # Create a new category table; the data itself is read from an Excel file.
    repo_id = request.POST['repo_id']
    # create_id = request.POST['create_id']
    file_id = request.POST['file_id']
    try:
        news_col = Mongodb(db='knowledge', collection='text').get_collection()
    except Exception:
        return self.error("mongodb没有数据库或者表")
    try:
        ret_file_data = TDataAcquisitionLog.objects.get(id=file_id)
    except Exception:
        return self.error("id没有对应文件")
    ret_file_data_dict = model_to_dict(ret_file_data)
    file_name = ret_file_data_dict['data_source']
    path_str = ret_file_data_dict['data_access']
    try:
        data = xlrd.open_workbook(path_str + file_name)
    except Exception:
        return self.error("没有找到对应文件")
    table_name = data.sheet_names()[0]
    table = data.sheet_by_name(table_name)
    list_attribute = list(table.row_values(0))
    row = table.nrows
    col = table.ncols
    for i in range(1, row):
        dict_data = {}
        for j in range(0, col):
            dict_data[list_attribute[j]] = table.row_values(i)[j]
        dict_data['file_id'] = file_id
        news_col.insert_one(dict_data)
    ret_l = {'context': 'success'}
    return render(request, 'test1.html', context=ret_l)
def GetStatistics(spider_id, repo_id):
    collection = Mongodb(db='knowledge', collection='text').get_collection()
    count = collection.find({"spider_id": spider_id, "repo_id": repo_id}).count()
    # comment_count = comments_collection.find({FieldName.DATA_WEBSITE: str(project.data_website), FieldName.DATA_REGION: str(project.data_region), FieldName.DATA_SOURCE: str(project.data_source)}).count()
    # try:
    #     predict_comment_count = shops_collection.aggregate([{'$match': {FieldName.DATA_WEBSITE: str(project.data_website), FieldName.DATA_REGION: str(project.data_region), FieldName.DATA_SOURCE: str(project.data_source), FieldName.SHOP_COMMENT_NUM: {"$gt": 0}}}, {'$group': {"_id": "$%s" % FieldName.SHOP_URL, "num": {"$first": "$%s" % FieldName.SHOP_COMMENT_NUM}}}, {'$group': {"_id": None, "sum": {"$sum": "$num"}}}]).next().get('sum')
    # except Exception:
    #     predict_comment_count = 0
    curr_date = time.strftime('%Y-%m-%d', time.localtime(time.time()))
    count_today = collection.find({
        "spider_id": spider_id,
        "repo_id": repo_id,
        "value.crawl_time": {'$regex': curr_date}
    }).count()
    week_start = time.strftime('%Y-%m-%d %H:%M:%S', time.localtime(time.time() - 7 * 24 * 3600))
    count_week = collection.find({
        "spider_id": spider_id,
        "repo_id": repo_id,
        "value.crawl_time": {'$gt': week_start}
    }).count()
    month_start = time.strftime('%Y-%m-%d %H:%M:%S', time.localtime(time.time() - 30 * 24 * 3600))
    count_month = collection.find({
        "spider_id": spider_id,
        "repo_id": repo_id,
        "value.crawl_time": {'$gt': month_start}
    }).count()
    result = '数据:%6s条 今日:%6s条 本周:%6s条 本月:%s条' % (count, count_today, count_week, count_month)
    return result
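# Note: Cursor.count() as used above was deprecated in PyMongo 3.7 and removed in
# PyMongo 4.0. A minimal sketch of the equivalent call with count_documents(),
# assuming the same document shape the spiders write; this is not project code,
# only an illustration of the replacement API:
import time
from model.mongodb import Mongodb

def get_statistics_count_documents(spider_id, repo_id):
    collection = Mongodb(db='knowledge', collection='text').get_collection()
    curr_date = time.strftime('%Y-%m-%d', time.localtime(time.time()))
    count = collection.count_documents({"spider_id": spider_id, "repo_id": repo_id})
    count_today = collection.count_documents({
        "spider_id": spider_id,
        "repo_id": repo_id,
        "value.crawl_time": {'$regex': curr_date}
    })
    return '数据:%6s条 今日:%6s条' % (count, count_today)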
from model.mongodb import Mongodb
from bson import ObjectId

if __name__ == '__main__':
    test_col = Mongodb(db='test', collection='test').get_collection()
    # for i in test_col.find():
    #     if "file_id" in i:
    #         print(i["file_id"])
    #         test_col.update_one({"_id": i["_id"]}, {"$set": {"alexa": "12"}})
    test = test_col.find({"file_id": 1, "value.test1": {"$exists": True}})
    for item in test:
        print(item["_id"])
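# The Mongodb helper imported from model.mongodb is not shown in this section.
# A minimal sketch of what such a wrapper could look like, assuming it is a thin
# layer over pymongo; the connection details are placeholders, not the project's:
from pymongo import MongoClient

class Mongodb:
    def __init__(self, db, collection, host='127.0.0.1', port=27017):
        # One client per wrapper instance; a real implementation would likely
        # share a module-level client instead.
        self._client = MongoClient(host=host, port=port)
        self._collection = self._client[db][collection]

    def get_collection(self):
        return self._collection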
def extract_relationship_from_unstructured_data(self, request, file_id,
                                                relationship_attribute_list=None):
    """
    Extract relationships from unstructured data.
    :param file_id: file id; used to locate the data to analyse in mongodb
    :param relationship_attribute_list: list of relationship-attribute ids that use this algorithm
    :param request:
    :return:
    """
    print("------------------------非结构关系抽取")
    tmp_info = {
        'file_id': file_id,
        'user_id': request.session["user_id"],
        'repo_id': request.session["repo_id"]
    }
    collection = Mongodb(db='knowledge', collection='text').get_collection()
    ret_entity = collection.find(tmp_info)
    ret_entity_map = list()
    for item in ret_entity:
        if "内容" in item["value"]:
            ret_entity_map.append(item)
    if len(ret_entity_map) == 0 or relationship_attribute_list is None:
        print("无可抽取内容")
        return

    relationship_list = []
    # all_category = TCategory.objects.filter(repo_id=request.session["repo_id"], create_id=request.session["user_id"], category_type=1)
    added_category_id = set()
    for attribute_id in relationship_attribute_list:
        cur_attribute = TAttribute.objects.get(id=attribute_id)
        category_from = TCategory.objects.get(id=cur_attribute.category_id)
        # The original passed the TAttribute instance itself as the id; looking up the
        # attribute's data type id is the likely intent.
        data_type = TDataType.objects.get(id=cur_attribute.data_type_id)
        category_to = TCategory.objects.get(id=data_type.category_id)
        category_from_name = BaseController.get_category_name(request, category_from.category_name)
        category_to_name = BaseController.get_category_name(request, category_to.category_name)
        one_relationship = list()
        one_relationship.append(cur_attribute.attribute_name)
        one_relationship.append(category_from_name)
        one_relationship.append(BaseController.get_category_name(request, cur_attribute.attribute_name))
        one_relationship.append(category_to_name)
        relationship_list.append(one_relationship)
        self.hanlp_tool.add_word_list([
            {"word": alia_item.attribute_alias,
             "mask": BaseController.get_category_name(request, cur_attribute.attribute_name)}
            for alia_item in TAttrbuteAlias.objects.filter(attribute_id=cur_attribute.id)
        ])
        if category_from.id not in added_category_id:
            ret_list_id, ret_list_val = some_data_deal_func().inputCategoryIdReturnName(
                categoryId=category_from.id,
                repoId=request.session["repo_id"],
                createId=request.session["user_id"])
            self.hanlp_tool.add_word_list(
                [{"word": val_item, "mask": category_from_name} for val_item in ret_list_val])
            added_category_id.add(category_from.id)
        if category_to.id not in added_category_id:
            ret_list_id, ret_list_val = some_data_deal_func().inputCategoryIdReturnName(
                categoryId=category_to.id,
                repoId=request.session["repo_id"],
                createId=request.session["user_id"])
            self.hanlp_tool.add_word_list(
                [{"word": val_item, "mask": category_to_name} for val_item in ret_list_val])
            added_category_id.add(category_to.id)

    # for category_item in all_category:
    #     try:
    #         one_data_type = TDataType.objects.get(category_id=category_item.id, repo_id=request.session["repo_id"], create_id=request.session["user_id"])
    #         attribute_list = TAttribute.objects.filter(data_type_id=one_data_type.id)
    #         category_to_name = BaseController.get_category_name(request, category_item.category_name)
    #         for attribute_item in attribute_list:
    #             category_from = TCategory.objects.get(id=attribute_item.category_id)
    #             category_from_name = BaseController.get_category_name(request, category_from.category_name)
    #             one_relationship = list()
    #             one_relationship.append(attribute_item.attribute_name)
    #             one_relationship.append(category_from_name)
    #             one_relationship.append(BaseController.get_category_name(request, attribute_item.attribute_name))
    #             one_relationship.append(category_to_name)
    #             relationship_list.append(one_relationship)
    #             self.hanlp_tool.add_word_list([{"word": alia_item.attribute_alias,
    #                                             "mask": BaseController.get_category_name(request, attribute_item.attribute_name)}
    #                                            for alia_item in TAttrbuteAlias.objects.filter(attribute_id=attribute_item.id)])
    #             print([{"word": alia_item.attribute_alias,
    #                     "mask": BaseController.get_category_name(request, attribute_item.attribute_name)}
    #                    for alia_item in TAttrbuteAlias.objects.filter(attribute_id=attribute_item.id)])
    #             if category_from.id not in added_category_id:
    #                 ret_list_id, ret_list_val = some_data_deal_func().inputCategoryIdReturnName(categoryId=category_from.id, repoId=request.session["repo_id"], createId=request.session["user_id"])
    #                 self.hanlp_tool.add_word_list([{"word": val_item, "mask": category_from_name} for val_item in ret_list_val])
    #                 added_category_id.add(category_from.id)
    #         if category_item.id not in added_category_id:
    #             ret_list_id, ret_list_val = some_data_deal_func().inputCategoryIdReturnName(
    #                 categoryId=category_item.id, repoId=request.session["repo_id"],
    #                 createId=request.session["user_id"])
    #             self.hanlp_tool.add_word_list(
    #                 [{"word": val_item, "mask": category_to_name} for val_item in ret_list_val])
    #             added_category_id.add(category_item.id)
    #     except ObjectDoesNotExist:
    #         continue

    neo4j = Neo4j()
    cout = 0
    for i in ret_entity_map:
        _id = i['_id']
        value = i['value']
        content = value['内容']
        text = HanlpUnit().get_text_from_html(content)
        sentenceList = self.hanlp_tool.split_paragraph(text)
        extract_relationship = []
        for sent in sentenceList:
            sent = sent.strip()
            relationships = self.eventExtractionByTemplateMatching(sent, relationship_list)
            # relationships = self.eventExtractionByTemplateMatching(text.strip(), relationship_list)
            for item in relationships:
                relation_id = item[0]
                cur_relationship = relationship_list[relation_id]
                extract_relationship.append({
                    "object_from_category": cur_relationship[1],
                    "object_to_category": cur_relationship[3],
                    "object_from_name": item[1],
                    "object_relationship_name": item[2],
                    "object_to_name": item[3]
                })
                object1 = neo4j.match(object_from={"label_name": cur_relationship[1],
                                                   "content": {"名字": item[1]}})
                object2 = neo4j.match(object_from={"label_name": cur_relationship[3],
                                                   "content": {"名字": item[3]}})
                if object1 is not None and len(object1) == 1 and object2 is not None and len(object2) == 1:
                    neo4j.createRelationship(labelOne=cur_relationship[1],
                                             labelTwo=cur_relationship[3],
                                             relationShipName=item[2],
                                             propertyOne={"名字": item[1]},
                                             propertyTwo={"名字": item[3]})
        if "relationship_extract_result" in i:
            extract_relationship = self.merge_list(extract_relationship, i["relationship_extract_result"])
        cout += 1
        print(str(cout) + "个文章" + ",抽取数量:" + str(len(extract_relationship)))
        collection.update_one({"_id": ObjectId(_id)},
                              {"$set": {"relationship_extract_result": extract_relationship}})
class MtimeSpider(Driver):
    def __init__(self, isheadless=False, ismobile=False, isvirtualdisplay=False,
                 isloadimages=True, isproxy=False, spider_id='2'):
        Driver.__init__(self, log_file_name=spider_id, ismobile=ismobile,
                        isvirtualdisplay=isvirtualdisplay, isheadless=isheadless,
                        isloadimages=isloadimages, isproxy=isproxy)
        self.collection = Mongodb(db='knowledge', collection='text').get_collection()

    def get_news_from_one_page(self, ele=None):
        if ele is None:
            return None
        self.fast_click_page_by_elem(ele=ele)
        # self.fast_new_page(url)
        time.sleep(1)
        if self.judge_web_element_exist_by_css_selector(css_selector="p.newsinnerpageall > span > a"):
            show_all_page_btn = self.until_presence_of_element_located_by_css_selector(
                css_selector="p.newsinnerpageall > span > a")
            show_all_page_btn.click()
        try:
            news_title = self.until_presence_of_element_located_by_css_selector(
                css_selector="div.newsheader > div.newsheadtit").text
            news_time = re.findall(
                r"(\d{4}-\d{1,2}-\d{1,2}\s\d{1,2}:\d{1,2}:\d{1,2})",
                self.until_presence_of_element_located_by_css_selector(
                    css_selector="div.newsheader > p.newstime").text)[0]
            news_source = self.until_presence_of_element_located_by_css_selector(
                css_selector="div.newsheader > p.newstime > span.ml15").text.split(":")[1]
            news_content = self.until_presence_of_element_located_by_css_selector(
                css_selector="div.newsnote").get_attribute('innerHTML') + \
                self.until_presence_of_element_located_by_css_selector(
                    css_selector="div#newsContent").get_attribute("innerHTML")
            news_author = self.until_presence_of_element_located_by_css_selector(
                css_selector="p.newsediter").text.split(":")[1]
        except Exception:
            return None
        crawl_time = time.strftime('%Y-%m-%d %H:%M:%S', time.localtime(time.time()))
        one_news = {}
        one_news.setdefault("标题", news_title)
        one_news.setdefault("时间", news_time)
        one_news.setdefault("来源", news_source)
        one_news.setdefault("内容", news_content)
        one_news.setdefault("作者", news_author)
        one_news.setdefault("crawl_from", self.get_current_url())
        # The original key was misspelled "crwal_time"; GetStatistics queries "value.crawl_time".
        one_news.setdefault("crawl_time", crawl_time)
        self.close_curr_page()
        return one_news

    def get_news_infos(self, spider_id, user_id, repo_id, spider_name):
        url = "http://news.mtime.com/movie/1/"
        self.fast_new_page(url=url)
        time.sleep(1)
        final_result = []
        flag = 0
        while True:
            while self.judge_web_element_exist_by_css_selector(
                    css_selector="div.newscontent > div#leftNews > a#viewmore"):
                more_info_btn = self.until_presence_of_element_located_by_css_selector(
                    css_selector="div.newscontent > div#leftNews > a#viewmore")
                self.scroll_to_center(more_info_btn)
                more_info_btn.click()
                time.sleep(1)
            news_list = self.until_presence_of_all_elements_located_by_css_selector(
                css_selector="ul#newslist > li")
            for item in news_list:
                one_news = self.get_news_from_one_page(ele=item)
                if one_news is None:
                    continue
                print(one_news)
                judge_result = self.judge_data_exist_by_keys(
                    collection=self.collection,
                    keys={"user_id": user_id,
                          "repo_id": repo_id,
                          "value.crawl_from": one_news["crawl_from"]})
                if judge_result:
                    final_result.append(one_news)
                else:
                    flag = 1
                    break
            if flag == 1 or not self.judge_web_element_exist_by_css_selector(
                    css_selector="div#pages > a.cur + a"):
                break
            else:
                next_page_btn = self.until_presence_of_element_located_by_css_selector(
                    css_selector="div#pages > a.cur + a")
                self.fast_click_page_by_elem(ele=next_page_btn)
                time.sleep(1)
        if len(final_result) == 0:
            return
        one_data_acquisition_log = TDataAcquisitionLog.objects.create(
            create_time=timezone.now(),
            data_source_name=spider_name,
            data_access="爬虫",
            repo_id=int(repo_id),
            create_id=int(user_id),
            data_path="")
        TEntityExtractionLog.objects.create(
            data_acquisition_id=one_data_acquisition_log.id,
            is_extract=0,
            entity_number=0,
            extract_time=timezone.now(),
            create_id=int(user_id),
            repo_id=int(repo_id))
        for item in final_result:
            self.collection.insert_one({
                "file_id": one_data_acquisition_log.id,
                "category_id": -1,
                "spider_id": int(spider_id),
                "user_id": int(user_id),
                "repo_id": int(repo_id),
                "value": item
            })
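# Every spider in this module writes documents of the shape
# {"file_id": ..., "category_id": ..., "spider_id": ..., "user_id": ..., "repo_id": ..., "value": {...}},
# where only "value" holds the crawled payload. A small sketch (not project code) of
# reading one acquisition batch back out of the 'knowledge.text' collection by its
# TDataAcquisitionLog id:
from model.mongodb import Mongodb

def load_batch(file_id, user_id, repo_id):
    collection = Mongodb(db='knowledge', collection='text').get_collection()
    query = {"file_id": file_id, "user_id": int(user_id), "repo_id": int(repo_id)}
    # Strip the bookkeeping fields and return just the crawled values.
    return [doc["value"] for doc in collection.find(query)]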
from model.mongodb import Mongodb
import json
from bson import json_util, objectid

a = b'{"_id":{"$oid":"5dfc557411647d60345088a3"},"datetime":"2011-01-01","movie_name":"\xe8\xae\xa9\xe5\xad\x90\xe5\xbc\xb9\xe9\xa3\x9e","release_time":"\xe4\xb8\x8a\xe6\x98\xa017\xe5\xa4\xa9","crawl_from":"\xe7\x8c\xab\xe7\x9c\xbc\xe4\xb8\x93\xe4\xb8\x9a\xe7\x89\x88","crawl_time":"2019-12-20 13:00:36","boxoffice_ratio":"47.7%","screenings_number":"11587","screenings_ratio":"37.3%","field_trips":"0","attendance_rate":"--","boxoffice_statistics":"3008.13","total_boxoffice":"5.20\xe4\xba\xbf"}\n'
a = json.loads(a, object_hook=json_util.object_hook)
print(type(a["_id"]))
print(isinstance(a["_id"], objectid.ObjectId))
Mongodb(db="test1", collection="test1").get_collection().insert_one(a)
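# The reverse direction also works: bson.json_util.dumps() turns documents containing
# ObjectId (and other BSON types) back into MongoDB Extended JSON, so the round trip is
# lossless. A minimal sketch:
from bson import ObjectId, json_util

doc = {"_id": ObjectId("5dfc557411647d60345088a3"), "movie_name": "让子弹飞"}
encoded = json_util.dumps(doc)      # '{"_id": {"$oid": "5dfc557411647d60345088a3"}, ...}'
decoded = json_util.loads(encoded)  # the ObjectId is restored, not left as a plain dict
assert decoded["_id"] == doc["_id"]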
class DoubanSpider(Driver):
    # Set of filmmaker douban urls already crawled, used to filter out duplicates.
    member_set = set()

    def __init__(self, isheadless=False, ismobile=False, isvirtualdisplay=False,
                 isloadimages=True, isproxy=False, proxy_ip_from="", spider_id='2',
                 data_queue=None):
        Driver.__init__(self, log_file_name=spider_id, ismobile=ismobile,
                        isvirtualdisplay=isvirtualdisplay, isheadless=isheadless,
                        isloadimages=isloadimages, isproxy=isproxy,
                        proxy_ip_from=proxy_ip_from)
        self.movie_col = Mongodb(db='knowledge', collection='text').get_collection()
        # self.member_col = Mongodb(db='movies', collection='member').get_collection()
        # self.comment_col = Mongodb(db='movies', collection="comments").get_collection()

    def get_member_info(self, url=""):
        """
        Get the detailed personal information of one filmmaker.
        :param url:
        :return:
        """
        self.fast_new_page(url=url)
        if "条目不存在" in self.driver.title or "页面不存在" in self.driver.title:
            self.close_curr_page()
            return None
        name = self.driver.title[:-4].strip()
        member_data = {}
        member_data.setdefault("member_name", name)
        member_data.setdefault("douban_url", url)
        member_div_infos = self.until_presence_of_all_elements_located_by_css_selector("div.info > ul > li")
        for item in member_div_infos:
            item = item.text.split(":")
            key = item[0].strip()
            if len(item) > 2:
                value = ":".join(item[1:])
            else:
                value = item[1]
            if key == "性别" or key == "星座" or key == "出生日期" or key == "出生地" or key == "官方网站":
                member_data.setdefault(key, value.strip())
            else:
                member_data.setdefault(key, [item.strip() for item in value.split("/")])
        self.close_curr_page()
        return member_data
        # self.member_col.insert_one(member_data)
        # self.info_log(data="取得个人资料数据----" + member_data["member_name"])
        # return True

    def get_member_awards(self, url=""):
        """
        Get all the awards one filmmaker has ever won.
        :param url:
        :return:
        """
        self.fast_new_page(url=url)
        awards_div = self.until_presence_of_element_located_by_css_selector("div.grid-16-8.clearfix > div.article")
        result = []
        try:
            awards_info = self.until_presence_of_all_elements_located_by_css_selector(
                css_selector="div.awards", ele=awards_div, timeout=5)
        except Exception:
            self.close_curr_page()
            return result
        for temp in awards_info:
            awards_time = self.until_presence_of_element_located_by_css_selector(css_selector="div.hd > h2", ele=temp)
            awards = self.until_presence_of_all_elements_located_by_css_selector(css_selector="ul.award", ele=temp)
            for award in awards:
                data = {}
                award_info = self.until_presence_of_all_elements_located_by_css_selector(css_selector="li", ele=award)
                data.setdefault("time", awards_time.text)
                data.setdefault("award_from", award_info[0].text)
                data.setdefault("award", award_info[1].text)
                data.setdefault("relevant_movie", award_info[2].text)
                result.append(data)
        self.close_curr_page()
        return result

    def get_member_movies(self, url=""):
        """
        Get the list of all movies one filmmaker has taken part in.
        :param url:
        :return:
        """
        movies = []
        self.fast_new_page(url=url)
        while True:
            movies_a = self.until_presence_of_all_elements_located_by_css_selector(
                "div.article > div.grid_view > ul > li > dl > dd > h6 > a")
            for temp in movies_a:
                movies.append(temp.text)
            try:
                self.vertical_scroll_to()
                next_page = self.until_presence_of_element_located_by_css_selector(
                    "div.article > div.paginator > span.next > a", timeout=5)
                next_page.click()
                time.sleep(1)
            except Exception:
                self.close_curr_page()
                return movies

    def get_comments(self, url="", movie_name="", movie_id=None):
        """
        Get the 20 comments shown on a single page.
        :param url:
        :param movie_name:
        :return:
        """
        self.fast_new_page(url=url)
        if "页面不存在" in self.driver.title or "条目不存在" in self.driver.title:
            self.close_curr_page()
            return
        comments_list = self.until_presence_of_all_elements_located_by_css_selector(
            "div.article > div#comments.mod-bd > div.comment-item")
        if not self.judge_web_element_exist_by_css_selector(ele=comments_list[0], css_selector="div.comment"):
            self.close_curr_page()
            return
        for temp in comments_list:
            self.scroll_to_center(temp)
            data = {}
            commenter_name = self.until_presence_of_element_located_by_css_selector(
                css_selector="div.comment > h3 > span.comment-info > a", ele=temp)
            commenter_useful = self.until_presence_of_element_located_by_css_selector(
                css_selector="div.comment > h3 > span.comment-vote > span.votes", ele=temp)
            comment_content = self.until_presence_of_element_located_by_css_selector(
                css_selector="div.comment > p > span.short", ele=temp)
            comment_time = self.until_presence_of_element_located_by_css_selector(
                css_selector="div.comment > h3 > span.comment-info > span.comment-time", ele=temp)
            data.setdefault("movie_name", movie_name)
            data.setdefault("nickname", commenter_name.text)
            data.setdefault("useful", commenter_useful.text)
            data.setdefault("time", comment_time.text)
            data.setdefault("content", comment_content.text)
            data.setdefault("comment_from", "douban.com")
            if movie_id is not None:
                data.setdefault("movie_id", movie_id)
            if self.judge_web_element_exist_by_css_selector(
                    ele=temp, css_selector="div.comment > h3 > span.comment-info > span.rating"):
                commenter_evaluate = self.until_presence_of_element_located_by_css_selector(
                    css_selector="div.comment > h3 > span.comment-info > span.rating", ele=temp)
                data.setdefault("evaluate", commenter_evaluate.get_attribute("title"))
            else:
                data.setdefault("evaluate", "")
            # self.comment_col.insert_one(data)
        self.close_curr_page()

    def get_one_movie_info(self, ele=None):
        """
        Get the detailed data of one movie.
        :param ele:
        :return:
        """
        self.fast_click_page_by_elem(ele=ele)
        time.sleep(1)
        # self.fast_new_page(url=url)
        if "页面不存在" in self.driver.title or "条目不存在" in self.driver.title:
            self.close_curr_page()
            return None
        try:
            actor_more = self.driver.find_element_by_css_selector("div#info > span.actor > span.attrs > a.more-actor")
            actor_more.click()
            mask = 1
        except Exception:
            mask = 0
        div_info = self.until_presence_of_element_located_by_css_selector(css_selector="div#info")
        infos = div_info.text
        info_list = infos.split("\n")
        movie_info = {}
        for info in info_list:
            info = info.split(":")
            key = info[0].strip()
            if len(info) == 1 or (len(info) == 2 and info[1] == ""):
                continue
            elif len(info) > 2:
                value = ":".join(info[1:])
            else:
                value = info[1]
            if key == "官方网站":
                movie_info.setdefault(key, value.strip())
            else:
                movie_info.setdefault(key, [item.strip() for item in value.split("/")])
        # member_link = self.until_presence_of_all_elements_located_by_css_selector(css_selector="span span.attrs a",
        #                                                                           ele=div_info)
        # if mask == 1:
        #     member_link = member_link[:-1]
        # for item in member_link:
        #     item_link = item.get_attribute("href")
        #     if item_link in self.member_set:
        #         continue
        #     self.member_set.add(item_link)
        #     actor_info = {"member_name": item.text, "douban_url": item_link}
        #     self.dataQueue.put(actor_info)
        # self.close_curr_page()
        comment1 = self.until_presence_of_element_located_by_css_selector(
            "div#comments-section > div.mod-hd > h2 > span.pl > a")
        comment2 = self.until_presence_of_element_located_by_css_selector(
            "section#reviews-wrapper > header > h2 > span.pl > a")
        comment_number = int(re.findall(r'\d+', comment1.text)[0]) + int(re.findall(r'\d+', comment2.text)[0])
        movie_info.setdefault("豆瓣评论数量", comment_number)
        self.close_curr_page()
        return movie_info

    def get_movie_infos(self, spider_id, user_id, repo_id, spider_name):
        self.fast_new_page(
            url="https://movie.douban.com/explore#!type=movie&tag=%E7%83%AD%E9%97%A8&sort=recommend&page_limit=20&page_start=0")
        self.driver.refresh()
        if "页面不存在" in self.driver.title or "条目不存在" in self.driver.title:
            self.close_curr_page()
            return None
        # category_ul = self.until_presence_of_element_located_by_css_selector("ul.category")
        # category = self.until_presence_of_all_elements_located_by_css_selector(css_selector="li", ele=category_ul)[5:]
        # cur = 0
        # description = category[cur].text
        # category[cur].click()
        time.sleep(1)
        css_selector = "div.list-wp a.item"
        elements_list = self.until_presence_of_all_elements_located_by_css_selector(css_selector=css_selector)
        final_result = []
        for each in elements_list:
            data = {}
            self.vertical_scroll_to()
            time.sleep(1)
            self.scroll_to_center(ele=each)
            movie_link = each.get_attribute("href")
            movie_name = self.until_presence_of_element_located_by_css_selector(
                ele=each, css_selector="div.cover-wp > img")
            movie_score = self.until_presence_of_element_located_by_css_selector(
                ele=each, css_selector="p > strong")
            data.setdefault("电影名", movie_name.get_attribute("alt"))
            data.setdefault("豆瓣评分", movie_score.text)
            crawl_time = time.strftime('%Y-%m-%d %H:%M:%S', time.localtime(time.time()))
            data.setdefault("crawl_from", movie_link)
            data.setdefault("crawl_time", crawl_time)
            movie_info = self.get_one_movie_info(ele=each)
            if movie_info is None:
                # get_one_movie_info returns None for missing pages; skip to avoid crashing on update().
                continue
            movie_info.update(data)
            print(movie_info)
            final_result.append(movie_info)
        if len(final_result) == 0:
            return
        one_data_acquisition_log = TDataAcquisitionLog.objects.create(
            create_time=timezone.now(),
            data_source_name=spider_name,
            data_access="爬虫",
            repo_id=int(repo_id),
            create_id=int(user_id),
            data_path="")
        TEntityExtractionLog.objects.create(
            data_acquisition_id=one_data_acquisition_log.id,
            is_extract=0,
            entity_number=0,
            extract_time=timezone.now(),
            create_id=int(user_id),
            repo_id=int(repo_id))
        for item in final_result:
            judge_result = self.judge_data_exist_by_keys(
                collection=self.movie_col,
                keys={"user_id": user_id,
                      "repo_id": repo_id,
                      "value.电影名": item["电影名"],
                      "value.crawl_from": item["crawl_from"]})
            if judge_result is True:
                self.movie_col.insert_one({
                    "file_id": one_data_acquisition_log.id,
                    "category_id": -1,
                    "spider_id": int(spider_id),
                    "user_id": int(user_id),
                    "repo_id": int(repo_id),
                    "value": item
                })

    # def run(self):
    #     """
    #     Thread entry point: parse the url of each queued item and dispatch to the
    #     matching crawl method.
    #     :return:
    #     """
    #     self.info_log(data="线程启动", name=self.name)
    #     count = 0
    #     while not self.dataQueue.empty() and count == 0:
    #         temp = self.dataQueue.get(False)
    #         url_path = urlparse(temp["douban_url"]).path
    #         while True:
    #             try:
    #                 if "/celebrity" in url_path:
    #                     # Crawl one filmmaker's detail data.
    #                     member_info = self.get_member_info(temp["douban_url"])
    #                     if member_info is None:
    #                         print("人物数据不存在")
    #                         break
    #                     member_awards = self.get_member_awards(temp["douban_url"] + "awards")
    #                     member_movies = self.get_member_movies(temp["douban_url"] + "movies")
    #                     member_info.setdefault("awards", member_awards)
    #                     member_info.setdefault("acting_movies", member_movies)
    #                     self.member_col.insert_one(member_info)
    #                     self.info_log(data="成功获取并存储一条人物数据-----" + member_info["member_name"], name=self.threadName)
    #                 elif "/subject" in url_path and "/subject_search" not in url_path and "/comments" not in url_path:
    #                     # Crawl one movie; after success, push its comments url onto the queue.
    #                     movie_info = self.get_movie_info(temp["douban_url"])
    #                     if movie_info is None:
    #                         print("电影数据不存在")
    #                         break
    #                     movie_info.update(temp)
    #                     self.movie_col.insert_one(movie_info)
    #                     self.info_log(data="成功获取并存储一条电影数据-----" + movie_info["movie_name"], name=self.threadName)
    #                     print(movie_info)
    #                     comments_url = temp["douban_url"] + "comments?start=0&limit=20&sort=new_score&status=P"
    #                     self.dataQueue.put({"movie_name": temp["movie_name"], "douban_url": comments_url, "movie_id": movie_info["_id"]})
    #                 elif "/subject" in url_path and "/comments" in url_path:
    #                     # Parse the url and crawl up to 200 comments.
    #                     bits = list(urlparse(temp["douban_url"]))
    #                     qs = parse_qs(bits[4])
    #                     start = int(qs["start"][0])
    #                     while start <= 200:
    #                         qs["start"][0] = start
    #                         bits[4] = urlencode(qs, True)
    #                         temp["douban_url"] = urlunparse(bits)
    #                         self.get_comments(temp["douban_url"], temp["movie_name"], temp["movie_id"])
    #                         start += 20
    #                 count = 0
    #                 break
    #             except Exception:
    #                 # Count consecutive failures; switch proxy ip after each failure, give up after 5.
    #                 count += 1
    #                 if count > 5:
    #                     self.dataQueue.put(temp)
    #                     break
    #                 self.change_ip(self.get_ip(self.proxy_ip_from))

    @staticmethod
    def get_data_source():
        """
        Return the set of filmmaker douban urls that have already been crawled.
        :return:
        """
        member_col = Mongodb(db='movies', collection='member').get_collection()
        url_set = set()
        for item in member_col.find():
            url_set.add(item["douban_url"])
        return url_set
class MaoyanSpider(Driver):
    def __init__(self, isheadless=False, ismobile=False, isvirtualdisplay=False,
                 isloadimages=True, isproxy=False, spider_id='2'):
        Driver.__init__(self, log_file_name=spider_id, ismobile=ismobile,
                        isvirtualdisplay=isvirtualdisplay, isheadless=isheadless,
                        isloadimages=isloadimages, isproxy=isproxy)
        self.boxoffice_col = Mongodb(db='knowledge', collection='text').get_collection()
        self.news_col = Mongodb(db='movies1', collection='news').get_collection()

    @staticmethod
    def find_key_from_value(dict, value):
        key_list = dict.keys()
        for key in key_list:
            if value == dict[key]:
                return key
        return None

    def get_boxoffice_infos_from_one_page(self, url="", datetime="", user_id=-1, repo_id=-1):
        """
        Get the Maoyan box-office data for one date.
        :param repo_id:
        :param user_id:
        :param datetime:
        :param url:
        :return: (list of box-office dicts, flag telling the caller whether to keep going)
        """
        self.fast_new_page(url=url)
        time.sleep(1)
        if not self.judge_web_element_exist_by_css_selector(css_selector="div.dashboard-content"):
            self.close_curr_page()
            # The original returned a bare True here, which the caller cannot unpack;
            # keep the flag but pair it with an empty list.
            return [], True
        theads = self.until_presence_of_all_elements_located_by_css_selector(
            css_selector="div.dashboard-list > table.dashboard-table.table-header > thead > tr > th")[1:]
        theads = [item.text for item in theads]
        if not self.judge_web_element_exist_by_css_selector(
                css_selector="div.movielist-container > div.movielist > table.dashboard-table > tbody > tr"):
            self.close_curr_page()
            # Same fix as above: the original returned a bare False.
            return [], False
        boxoffice_infos = self.until_presence_of_all_elements_located_by_css_selector(
            css_selector="div.movielist-container > div.movielist > table.dashboard-table > tbody > tr")
        crawl_time = time.strftime('%Y-%m-%d %H:%M:%S', time.localtime(time.time()))
        boxoffice_data_from_the_page = []
        for item in boxoffice_infos:
            one_boxoffice_data = {}
            boxoffice_info = self.until_presence_of_all_elements_located_by_css_selector(
                css_selector="td", ele=item)
            movie_name = self.until_presence_of_element_located_by_css_selector(
                css_selector="div > div.moviename-desc > p.moviename-name", ele=boxoffice_info[0])
            movie_info = self.until_presence_of_all_elements_located_by_css_selector(
                css_selector="div > div.moviename-desc > p.moviename-info > span", ele=boxoffice_info[0])
            one_boxoffice_data.setdefault("日期", datetime)
            one_boxoffice_data.setdefault("电影名", movie_name.text)
            one_boxoffice_data.setdefault("上映时间", movie_info[0].text)
            one_boxoffice_data.setdefault("总票房", movie_info[1].text)
            boxoffice_info = boxoffice_info[1:]
            for i in range(len(boxoffice_info)):
                one_boxoffice_data.setdefault(theads[i], boxoffice_info[i].text)
            one_boxoffice_data.setdefault("crawl_time", crawl_time)
            one_boxoffice_data.setdefault("crawl_from", "猫眼专业版")
            # self.piaofang_col.insert_one(one_piaofang_data)
            judge_result = self.judge_data_exist_by_keys(
                collection=self.boxoffice_col,
                keys={"user_id": user_id,
                      "repo_id": repo_id,
                      "value.日期": one_boxoffice_data["日期"],
                      "value.电影名": one_boxoffice_data["电影名"],
                      "value.crawl_from": one_boxoffice_data["crawl_from"]})
            if judge_result is True:
                boxoffice_data_from_the_page.append(one_boxoffice_data)
            else:
                return boxoffice_data_from_the_page, False
        self.close_curr_page()
        return boxoffice_data_from_the_page, True

    def get_boxoffice_infos(self, spider_id, user_id, repo_id, spider_name):
        date = datetime.datetime.strptime("2020-01-23", '%Y-%m-%d')
        # date = datetime.datetime.now()
        final_result = []
        while True:
            data_list, result = self.get_boxoffice_infos_from_one_page(
                url="http://piaofang.maoyan.com/dashboard/movie?date=" + str(date)[:10],
                datetime=str(date)[:10],
                user_id=int(user_id),
                repo_id=int(repo_id))
            final_result.extend(data_list)
            if result is False:
                break
            date = date + datetime.timedelta(days=-1)
        if len(final_result) == 0:
            return
        one_data_acquisition_log = TDataAcquisitionLog.objects.create(
            create_time=timezone.now(),
            data_source_name=spider_name,
            data_access="爬虫",
            repo_id=int(repo_id),
            create_id=int(user_id),
            data_path="")
        TEntityExtractionLog.objects.create(
            data_acquisition_id=one_data_acquisition_log.id,
            is_extract=0,
            entity_number=0,
            extract_time=timezone.now(),
            create_id=int(user_id),
            repo_id=int(repo_id))
        for item in final_result:
            self.boxoffice_col.insert_one({
                "file_id": one_data_acquisition_log.id,
                "category_id": -1,
                "spider_id": int(spider_id),
                "user_id": int(user_id),
                "repo_id": int(repo_id),
                "value": item
            })

    def run_spider(self, url=""):
        lastest_info = self.boxoffice_col.find().sort("datetime", -1).limit(1)
        date = datetime.datetime.strptime(lastest_info[0]["datetime"], '%Y-%m-%d')
        date = date + datetime.timedelta(days=1)
        now = datetime.datetime.now()
        while date < now:
            self.get_boxoffice_infos_from_one_page(
                "http://piaofang.maoyan.com/dashboard/movie?date=" + str(date)[:10],
                str(date)[:10])
            date = date + datetime.timedelta(days=1)
def update_t_mapping_rule(self, repo_id, create_id):
    # Coverage has to be computed for every category of the repo.
    return_category = TCategory.objects.filter(repo_id=repo_id, create_id=create_id)
    category_name_list = []
    category_id_list = []
    for val in return_category:
        val_dict = model_to_dict(val)
        category_name_list.append(val_dict['category_name'])
        category_id_list.append(val_dict['id'])
    list_len = len(category_name_list)
    # print(list_len)
    # print(category_name_list)
    # print(category_id_list)
    for i in range(0, list_len):
        tmp_id = category_id_list[i]
        attribute_name_alias_map = {}
        return_attribute = TAttribute.objects.filter(category_id=tmp_id)
        # return_attribute_name_map wraps this lookup so the function does not get even longer:
        # it takes all queried attributes and returns a map. The map holds not only the
        # category's own attributes but also the ones inherited from its parent categories.
        attribute_name_alias_map = self.return_attribute_name_map(
            return_attribute, attribute_name_alias_map)
        # print(attribute_name_alias_map)
        ret_cate = TCategory.objects.get(id=tmp_id)
        ret_cate_dict = model_to_dict(ret_cate)
        father_category_id = ret_cate_dict['father_category_id']
        # print(father_category_id, type(father_category_id))
        if str(-1) != father_category_id:
            return_attribute_father = TAttribute.objects.filter(category_id=father_category_id)
            attribute_name_alias_map = self.return_attribute_name_map(
                return_attribute_father, attribute_name_alias_map)
            ret_cate_father = TCategory.objects.get(id=father_category_id)
            ret_cate_dict_father = model_to_dict(ret_cate_father)
            father_father_category_id = ret_cate_dict_father['father_category_id']
            # print(father_father_category_id)
            if str(-1) != father_father_category_id:
                return_attribute_father_father = TAttribute.objects.filter(
                    category_id=father_father_category_id)
                attribute_name_alias_map = self.return_attribute_name_map(
                    return_attribute_father_father, attribute_name_alias_map)
        # print(attribute_name_alias_map)
        news_col = Mongodb(db='knowledge', collection='text').get_collection()
        # print(list_len)
        # At this point every attribute name is already in attribute_name_alias_map.
        _insert_mapping_rule_attribute_name_list = []
        _insert_mapping_rule_attribute_coverage_rate_list = []
        category_id = category_id_list[i]
        # Count, from mongodb, how often each key appears in this category's documents.
        attribute_name_map = {}
        tmp_list = news_col.find({'category_id': category_id})
        num = 0
        for val in tmp_list:
            # print(val)
            num += 1
            if val is not None:
                for key in val.keys():
                    # print(key)
                    # print(attribute_name_map)
                    # print(key in attribute_name_map)
                    if key == '_id' or key == 'file_id' or key == 'category_id':
                        continue
                    elif key in attribute_name_map:
                        attribute_name_map[key] += 1
                    else:
                        attribute_name_map[key] = 1
        # If t_mapping_rule still holds attributes that no longer occur in the data,
        # those stale rows are removed here. (Deletions should really be logged as well,
        # otherwise they are hard to trace later.)
        delete_id_list = []
        # print(111)
        return_mapping_rule = TMappingRule.objects.filter(category_id=category_id, create_id=create_id)
        # print(111)
        for rule in return_mapping_rule:
            rule_dict = model_to_dict(rule)
            print(rule_dict)
            rule_dict_id = rule_dict['id']
            rule_dict_attribute_name = rule_dict['attribute_name']
            if rule_dict_attribute_name not in attribute_name_map.keys():
                delete_id_list.append(rule_dict_id)
        for mapping_rule_id in delete_id_list:
            rule_mapping = TMappingRule.objects.get(id=mapping_rule_id)
            rule_mapping.delete()
        # print(attribute_name_map)
        if attribute_name_map is not None:
            for key in attribute_name_map.keys():
                if key not in attribute_name_alias_map:
                    _insert_mapping_rule_attribute_name_list.append(key)
                    coverage_rate = 1.0 * attribute_name_map[key] / num
                    _insert_mapping_rule_attribute_coverage_rate_list.append(coverage_rate)
        attribute_name_list_len = len(_insert_mapping_rule_attribute_name_list)
        dt = datetime.now()
        # print(attribute_name_list_len)
        # print(_insert_mapping_rule_attribute_name_list)
        for k in range(0, attribute_name_list_len):
            attribute_name_val = _insert_mapping_rule_attribute_name_list[k]
            attribute_coverage_val = _insert_mapping_rule_attribute_coverage_rate_list[k]
            obj = TMappingRule.objects.filter(attribute_name=attribute_name_val,
                                              create_id=create_id).first()
            # print(attribute_name_val, attribute_coverage_val)
            if obj is None:
                # create
                TMappingRule.objects.create(attribute_name=attribute_name_val,
                                            coverage_rate=attribute_coverage_val,
                                            create_time=str(dt)[:19],
                                            category_id=category_id,
                                            create_id=create_id)
            else:
                # update
                obj.coverage_rate = attribute_coverage_val
                obj.create_time = str(dt)[:19]
                obj.save()
    return 1
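# For reference, coverage_rate above is simply the fraction of this category's mongodb
# documents that contain a given key. A toy illustration with made-up values, not
# project data: if 3 of 4 documents contain the key "上映时间", its coverage rate is 0.75.
docs = [{"上映时间": "2020"}, {"上映时间": "2019"}, {"上映时间": "2018"}, {"其他": 1}]
num = len(docs)
count = sum(1 for d in docs if "上映时间" in d)
coverage_rate = 1.0 * count / num  # -> 0.75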
class BaikeSpider(Driver):
    urls = []
    # tags = ["电影", "演员", "导演", "编剧", "制片人"]
    count = 0

    def __init__(self, isheadless=False, ismobile=False, isvirtualdisplay=False,
                 isloadimages=True, isproxy=False, proxy_ip_from="", spider_id='2'):
        Driver.__init__(self, log_file_name=spider_id, ismobile=ismobile,
                        isvirtualdisplay=isvirtualdisplay, isheadless=isheadless,
                        isloadimages=isloadimages, isproxy=isproxy,
                        proxy_ip_from=proxy_ip_from)
        # self.baike_col = Mongodb(db='movies1', collection="baike_member").get_collection()
        self.baike_col = Mongodb(db='baike', collection="test1").get_collection()

    def get_infos(self, url="", extensive_properties=None):
        if extensive_properties is None:
            extensive_properties = {}
        self.fast_new_page(url=url)
        relationship_urls = []
        relationship_tags = []
        if self.judge_web_element_exist_by_css_selector(
                css_selector="div.polysemantList-header-title > div.toggle.expand"):
            synonym = self.until_presence_of_element_located_by_css_selector(
                css_selector="div.polysemantList-header-title > div.toggle.expand > a")
            self.scroll_to_center(synonym)
            synonym.click()
            member_urls = self.until_presence_of_all_elements_located_by_css_selector(
                css_selector="ul.polysemantList-wrapper.cmn-clearfix > li.item > a")
            for item in member_urls:
                # for tag in self.tags:
                #     if tag in item.text:
                relationship_urls.append(item.get_attribute("href"))
                relationship_tags.append(item.text)
                # break
            if self.driver.current_url not in self.urls:
                data = self.get_base_info_from_baike()
                if data is not None:
                    current_tag = self.until_presence_of_element_located_by_css_selector(
                        css_selector="ul.polysemantList-wrapper.cmn-clearfix > li.item > span.selected")
                    data.setdefault("tag", current_tag.text)
                    data.update(extensive_properties)
                    print(data)
                    self.baike_col.insert_one(data)
                    self.urls.append(self.driver.current_url)
        self.close_curr_page()
        for item in relationship_urls:
            if item not in self.urls:
                self.fast_new_page(url=item)
                data = self.get_base_info_from_baike()
                if data is not None:
                    data.setdefault("tag", relationship_tags[relationship_urls.index(item)])
                    data.update(extensive_properties)
                    print(data)
                    self.baike_col.insert_one(data)
                    self.urls.append(item)
                self.close_curr_page()
        if self.count == 10:
            return False
        return True

    def get_base_info_from_baike(self):
        try:
            if not self.judge_web_element_exist_by_css_selector(
                    css_selector="div.content > div.main-content div.basic-info.cmn-clearfix"):
                return
            basic_info_div = self.until_presence_of_element_located_by_css_selector(
                css_selector="div.content > div.main-content div.basic-info.cmn-clearfix")
            if self.judge_web_element_exist_by_css_selector(ele=basic_info_div,
                                                            css_selector="a.toggle.toExpand"):
                btn = self.until_presence_of_element_located_by_css_selector(
                    ele=basic_info_div, css_selector="a.toggle.toExpand")
                self.scroll_to_center(btn)
                btn.click()
            basic_info_name = self.until_presence_of_all_elements_located_by_css_selector(
                css_selector="dl > dt.basicInfo-item.name", ele=basic_info_div)
            basic_info_value = self.until_presence_of_all_elements_located_by_css_selector(
                css_selector="dl > dd.basicInfo-item.value", ele=basic_info_div)
            data = {}
            for i in range(len(basic_info_name)):
                name = basic_info_name[i].text.replace(" ", "")
                value = basic_info_value[i].text
                if name == "" or value.replace(" ", "") == "":
                    continue
                data.setdefault(name, value)
            data.setdefault("url", self.driver.current_url)
            if self.judge_web_element_exist_by_css_selector(css_selector="div.lemma-summary"):
                base_infos = self.until_presence_of_element_located_by_css_selector(
                    css_selector="div.lemma-summary").text
                data.setdefault("基础信息", base_infos)
            self.count = 0
            return data
        except Exception:
            self.count += 1
def eventExtraction(self, request, file_id, lEventCategoryId):
    """
    Event extraction by template matching.
    :param request: the request object
    :param file_id: str, file id
    :param lEventCategoryId: list of event category ids
    :return: True
    """
    # ruleId is 1 or 2: rule 1 events are subject-predicate-object triples,
    # rule 2 events are subject-predicate only.
    # only for debug
    # request.session['user_id'] = 1
    # request.session['repo_id'] = 1
    # fileId = 13
    # only for debug
    # fileId = request.POST['fileId']
    # request.session['repo_id'] = 1
    # request.session['user_id'] = 1
    repoId = request.session['repo_id']
    createId = request.session['user_id']
    # Extraction results are written back under this file_id.
    tmp_info = {'file_id': file_id, 'user_id': createId, 'repo_id': repoId}
    news_col = Mongodb(db='knowledge', collection='text').get_collection()
    cnt = 1
    ret_entity = news_col.find(tmp_info)
    ret_entity_map = list()
    for item in ret_entity:
        if "内容" in item["value"]:
            ret_entity_map.append(item)
    if len(ret_entity_map) == 0:
        return
    print("--------------------事件抽取")
    # Before extraction, register every relevant word with the segmenter.
    # The loop below adds all trigger words of this repo together with the names
    # of their event subjects and objects.
    retTriggerWordList = TTriggerWord.objects.filter(repo_id=repoId)
    eventLabelList = []
    # hanlpUnit = HanlpUnit()
    # TODO: this should really start the query from the event category.
    for i in retTriggerWordList:
        tmpLableList = []
        ruleId = 1
        retTriggerWordDict = model_to_dict(i)
        triggerId = retTriggerWordDict['id']
        eventId = retTriggerWordDict['event_rule_id']
        # print(111, eventId)
        # Trigger word name and its tag.
        retEventRule = TEventRule.objects.get(id=eventId)
        # print(333, retEventRule.category_id)
        retCategoryName = TCategory.objects.get(id=retEventRule.category_id).category_name
        # print(444, retCategoryName)
        # Here the trigger word is tagged with the event's label; revisit later.
        triggerWord = retTriggerWordDict['trigger_word']
        triggerWordId = BaseController.get_category_name(request, retCategoryName)
        # print(222, eventId)
        eventRule = TEventRule.objects.get(id=eventId, repo_id=repoId)
        eventRuleDict = model_to_dict(eventRule)
        eventCategoryId = eventRuleDict['category_id']
        if eventCategoryId not in lEventCategoryId:
            continue
        eventCategory = TCategory.objects.get(id=eventCategoryId, repo_id=repoId, create_id=createId)
        eventCategoryDict = model_to_dict(eventCategory)
        eventCategoryName = eventCategoryDict['category_name']
        tmpLableList.append(eventCategoryName)  # event category
        subjectCategoryId = eventRuleDict['event_subject_id']
        subjectCategory = TCategory.objects.get(id=subjectCategoryId, repo_id=repoId, create_id=createId)
        subjectCategoryDict = model_to_dict(subjectCategory)
        subjectCategoryName = subjectCategoryDict['category_name']
        subjectId = BaseController.get_category_name(request, subjectCategoryName)
        tmpLableList.append(subjectId)
        retListId, retListVal = some_data_deal_func().inputCategoryIdReturnName(
            subjectCategoryId, repoId, createId)
        # Add every value in retListVal to the segmenter before cutting;
        # build the word list pairing each word with its mask.
        constructWordList = []
        tmpSet = self.hanlp_tool.added_word_list
        # print(len(retListVal))
        for word in retListVal:
            if word is None:
                continue
            tmpDict = {}
            tmpDict['word'] = word
            # print(word)
            # item["word"], item["mask"]
            tmpDict['mask'] = subjectId
            constructWordList.append(tmpDict)
        # add_word_list expects a list such as [{'word': ..., 'mask': ...}]
        self.hanlp_tool.add_word_list(constructWordList)
        # print(constructWordList)
        objectCategoryId = eventRuleDict['event_object_id']
        negativeOne = -1
        if objectCategoryId == negativeOne:
            ruleId = 2
        constructWordList = []
        tmpDict = {}
        tmpDict['word'] = triggerWord
        tmpDict['mask'] = str(triggerWordId)
        tmpSet = self.hanlp_tool.added_word_list
        constructWordList.append(tmpDict)
        self.hanlp_tool.add_word_list(constructWordList)
        tmpLableList.append(str(triggerWordId))
        print(ruleId)
        if ruleId == 1:
            objectCategoryId = eventRuleDict['event_object_id']
            objectCategory = TCategory.objects.get(id=objectCategoryId, repo_id=repoId, create_id=createId)
            objectCategoryDict = model_to_dict(objectCategory)
            objectCategoryName = objectCategoryDict['category_name']
            objectId = BaseController.get_category_name(request, objectCategoryName)
            retListId, retListVal = some_data_deal_func().inputCategoryIdReturnName(
                objectCategoryId, repoId, createId)
            tmpLableList.append(objectId)
            constructWordList = []
            tmpSet = self.hanlp_tool.added_word_list
            # This code has changed elsewhere and still needs adjusting.
            for word in retListVal:
                if word is None:
                    continue
                tmpDict = {}
                tmpDict['word'] = word
                # item["word"], item["mask"]
                tmpDict['mask'] = str(objectId)
                constructWordList.append(tmpDict)
            # add_word_list expects a list such as [{'word': ..., 'mask': ...}]
            # print(constructWordList)
            self.hanlp_tool.add_word_list(constructWordList)
        eventLabelList.append(tmpLableList)
    # eventLabelList holds, per rule: event category, event subject, event trigger word, event object.
    # print(eventLabelList)
    # print("list里面内容")
    # tmpS = self.hanlp_tool.added_word_list
    # for name in tmpS:
    #     print(name)
    # print("list里面内容结束")
    # return True
    # name
    attribute = TAttribute.objects.get(category_id=1)
    attributeDict = model_to_dict(attribute)
    attributeName = attributeDict['attribute_name']
    # print(self.hanlp_tool.added_word_list)
    cnt = 1
    for i in ret_entity_map:
        _id = i['_id']  # results are written back by this id
        value = i['value']
        basetime = str(value['时间'])
        content = value['内容']
        text = HanlpUnit().get_text_from_html(content)
        sentenceList = self.hanlp_tool.split_paragraph(text)
        # print(sentenceList)
        event_extract_result = []
        count = 0
        countIndex = 0
        # Collected per event: time, location, subject, object, and the categories of both.
        tmpEventSet = set()
        for sent in sentenceList:
            sent = sent.strip()
            # print(sent)
            # Segment each sentence and extract its events.
            # sent = "浙江杭州明天林更新出演动作喜剧《快手枪手快枪手》"
            sentenceDealResult = self.hanlp_tool.cut(sent)
            event = self.eventExtractionByTemplateMatching(sent, eventLabelList)
            # Event extraction done.
            # dateTime still needs adjusting; basetime can cause problems.
            # print(basetime)
            dateTime = basetime
            timeIndex = -1
            # print(123, timeIndex)
            timeIndex, timeWord, dateTime = Time_deal().dealTime(sent, basetime)
            if timeIndex != -1:
                timeIndex = timeIndex + countIndex
            # print(46, timeIndex)
            # print(11111111, dateTime)
            locationList = Time_deal().dealArea(sent)
            location = ''
            locationindex = -1
            for val in locationList:
                if len(val['place']) > len(location):
                    location = val['place']
                    locationindex = val['index'] + countIndex
            # print(location, locationindex)
            countIndex += len(sentenceDealResult)
            # The time and location need to be returned together with each event.
            # print(event)
            for eve in event:
                ruleId = 1
                if len(eve) == 3:
                    ruleId = 2
                eveId = eve[0]
                subjectLabel = eventLabelList[eveId][1]
                # triggerLabel = BaseController.get_category_name()eventLabelList[eveId][0]
                attribute = {}
                attribute['发生时间'] = dateTime
                attribute['地点'] = location
                eveString = ''
                for j in range(1, len(eve), 1):
                    eveString = eveString + str(eve[j])
                attribute['名字'] = eveString
                # The event label comes from the earlier category query.
                eventLabel = BaseController.get_category_name(request, eventLabelList[eveId][0])
                # print(eventLabel)
                # print(eventLabelList[eveId])
                # print(event)
                subjectLabel = eventLabelList[eveId][1]
                Neo4j().create_node_mjy_edition(eventLabel, attribute)
                subjectNameVal = eve[1]
                # print(subjectCategoryName, attributeName, subjectNameVal)
                neo4jSubjectId = Neo4j().quesIdByLabelAttribute(
                    subjectLabel, attributeName, '\'' + subjectNameVal + '\'')
                neo4jEventId = Neo4j().quesIdByLabelAttribute(
                    eventLabel, '名字', '\'' + eveString + '\'')
                Neo4j().createRelationship(subjectLabel, eventLabel, "主谓关系",
                                           {'id': neo4jSubjectId}, {'id': neo4jEventId})
                if ruleId == 1:
                    objectNameVal = eve[3]
                    objectLabel = eventLabelList[eveId][3]
                    neo4jObjectId = Neo4j().quesIdByLabelAttribute(
                        objectLabel, attributeName, '\'' + objectNameVal + '\'')
                    Neo4j().createRelationship(eventLabel, objectLabel, "动宾关系",
                                               {'id': neo4jEventId}, {'id': neo4jObjectId})
                    # print(neo4jSubjectId, neo4jEventId, neo4jObjectId)
                tmpEventDict = {}
                tmpEventDict['actual_event_time'] = dateTime
                # Collect the extracted event fields.
                tmpEventDict['time'] = timeWord
                tmpEventDict['timeIndex'] = timeIndex
                tmpEventDict['location'] = location
                tmpEventDict['locationIndex'] = locationindex
                # print(111, dateTime, location)
                tmpEventDict['eventSubject'] = eve[1]
                tmpEventDict['eventSubjectLabel'] = subjectLabel
                tmpEventDict['triggerLabel'] = eventLabel
                tmpEventDict['triggerWord'] = eve[2]
                tmpEventDict['eventName'] = eveString
                if ruleId == 1:
                    tmpEventDict['eventObject'] = eve[3]
                    objectLabel = eventLabelList[eveId][3]
                    tmpEventDict['eventObjectLabel'] = objectLabel
                if eveString not in tmpEventSet:
                    tmpEventSet.add(eveString)
                    event_extract_result.append(tmpEventDict)
                    print(tmpEventDict)
                count += 1
        # Write the extraction result back to mongodb.
        # print(count, event_extract_result)
        news_col.update_one({'_id': _id},
                            {"$set": {'event_extract_result': event_extract_result}})
        # news_col.insert_one()
        cnt += 1
        # if cnt >= 2:
        #     break
    return True
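# For readers of the loop above: eventExtractionByTemplateMatching is consumed as if it
# returned, per sentence, a list of matches of the form
#   [rule_index, subject, trigger_word]            (ruleId 2, subject-predicate)
#   [rule_index, subject, trigger_word, object]    (ruleId 1, subject-predicate-object)
# where rule_index points into eventLabelList. A purely illustrative example with made-up
# values, not output of the real matcher:
eventLabelList = [["合作事件", "公司", "签约", "公司"]]  # [event category, subject, trigger, object]
event = [[0, "甲公司", "签约", "乙公司"]]                # one rule-1 match for a sentence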
def __init__(self):
    self.__context = {}
    self.knowledge_col = Mongodb(db='knowledge', collection='text').get_collection()
def __init__(self, isheadless=False, ismobile=False, isvirtualdisplay=False,
             isloadimages=True, isproxy=False, proxy_ip_from="", spider_id='2',
             data_queue=None):
    Driver.__init__(self, log_file_name=spider_id, ismobile=ismobile,
                    isvirtualdisplay=isvirtualdisplay, isheadless=isheadless,
                    isloadimages=isloadimages, isproxy=isproxy,
                    proxy_ip_from=proxy_ip_from)
    self.movie_col = Mongodb(db='knowledge', collection='text').get_collection()