def scrape_detail(self, target, detail_page_bundle, content_ruler, encode):
    try:
        order = self.web_order
        ulweb = UlWebRuler()
        result = ulweb.scan_detail(target, detail_page_bundle, order, content_ruler, encode)
        code, detail_page_result = result
        if code == 1:
            if detail_page_result is not None:
                return (1, detail_page_result)
        elif code == -3:
            # Transient failure: park the bundle for a later retry pass.
            self.temp_list.append((target, detail_page_bundle, content_ruler, encode))
    except WebTargetOutOfDateException as e:
        return (-1, e.args[0])
    except Exception as e:
        LogGo.warning(repr(e))
        # Guard against exceptions raised without arguments.
        return (-1, e.args[0] if e.args else None)
    return (0, None)
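# --- Usage sketch (hypothetical caller, not part of the original code) ---
# The detail handlers in this class share a (code, payload) convention:
#   1  -> payload is a ready-to-store result dict
#   0  -> nothing scraped, nothing to retry
#  -1  -> hard failure, payload is an optional error message
#  -3  -> transient failure; the bundle was parked in self.temp_list
# A minimal driver built on that convention might look like this; the
# `scraper` object and `bundles` iterable are assumptions for illustration.
def drain_details(scraper, bundles):
    stored = []
    for (target, bundle, content_ruler, encode) in bundles:
        code, payload = scraper.scrape_detail(target, bundle, content_ruler, encode)
        if code == 1:
            stored.append(payload)
    # One retry pass over the transient failures queued during the loop.
    retries, scraper.temp_list = scraper.temp_list, []
    for (target, bundle, content_ruler, encode) in retries:
        code, payload = scraper.scrape_detail(target, bundle, content_ruler, encode)
        if code == 1:
            stored.append(payload)
    return stored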
def gensto(self, dao, list):
    status = False
    count = len(list)
    element = 0
    if count > 0:
        for item in list:
            try:
                id = genUUID()
                dao.save_or_update(item, id)
                element += 1
            except Exception as e:
                LogGo.warning(repr(e))
        LogGo.info('Total :' + str(count) + ' / ' + str(element) + ' elements Saved!')
        status = element > 0
    else:
        LogGo.info("0 Element!")
        status = False  # fixed: was misspelled `stauts`
    return status
def scan_list(self, target, exists):
    self.limited_forward_count = target.limited_forward_count
    self.limited_attitude_count = target.limited_attitude_count
    list = []
    result_list = []
    # Simulated login (placeholder).
    status = 'you got it'
    # If the login succeeded.
    if status != '':
        self.loops(target, exists, list)
        if len(list) < 1:
            return (0, (target, None, None, None))
    else:
        LogGo.warning("Weibo: Loop scan failed!")
        return (-1, (target, None, None, None))
    if len(list) > 0:
        list = self.purify(list)
        list.reverse()
        for item in list:
            if exists.count(item['id']) < 1:
                result_list.append(item)
    LogGo.debug('weibo list length:' + str(len(result_list)))
    if len(result_list) > 0:
        return (1, (target, list, None, None))
    return (-1, (target, None, None, None))
def store_program_type(self, pair):
    status = False
    count = len(pair)
    element = 0
    if count > 0:
        for item in pair:
            try:
                program, type = item
                self.program_type.save_by_program_type(program, type)
                element += 1
            except Exception:
                import traceback
                LogGo.warning(traceback.format_exc())
        LogGo.info('Total :' + str(count) + ' / ' + str(element) + ' elements Saved!')
        status = element > 0
    else:
        LogGo.info("0 Element!")
    return status
def newrank_detail(self, target, detail_page_bundle, content_ruler, encode):
    LogGo.info("On newrank detail: " + str(target.data_key))
    try:
        order = self.wechat_order
        newrank = NewrankRuler()
        result = newrank.scan_detail(target, detail_page_bundle, order, content_ruler, encode)
        code, detail_page_result = result
        if code == 1:
            if detail_page_result is not None:
                return (1, detail_page_result)
        elif code == -3:
            # Transient failure: park the bundle for a later retry pass.
            self.temp_list.append((target, detail_page_bundle, content_ruler, encode))
    except Exception:
        LogGo.warning('error in newrank detail')
        return (-1, None)
    return (0, None)
def genup(self, dao, list):
    status = False
    count = len(list)
    element = 0
    if count > 0:
        for (update, where) in list:
            try:
                dao.update(update, where)
                element += 1
            except Exception as e:
                LogGo.warning(repr(e))
        LogGo.info('Total :' + str(count) + ' / ' + str(element) + ' elements Updated!')
        status = element > 0
    else:
        LogGo.info("0 Element!")
        status = False  # fixed: was misspelled `stauts`
    return status
def store_count(self, list):
    status = False
    count = len(list)
    element = 0
    update = 0
    if count > 0:
        for program in list:
            try:
                if self.check_for_exists(program) == 1:
                    update += 1
                else:
                    id = genUUID()
                    self.pc.save(program, id)
                    element += 1
            except Exception:
                import traceback
                LogGo.warning(traceback.format_exc())
        LogGo.info('Total :' + str(count) + ' (' + str(element) + ' Saved, ' + str(update) + ' Updated)')
        status = element > 0 or update > 0
    else:
        LogGo.info("0 Element!")
    return status
def store_soap_target(self, targets: list, banned: bool = False):
    status = False
    element = 0
    for target in targets:
        try:
            # Banned programs go to a separate table.
            if banned:
                self.banned_program.save(target)
            else:
                self.soap_target.save(target)
            element += 1
        except Exception:
            import traceback
            LogGo.warning(traceback.format_exc())
    LogGo.info('Total :' + str(element) + ' / ' + str(len(targets)) + ' elements Saved!')
    status = element > 0
    return status
def store_program(self, ids, programs):
    status = False
    if len(ids) != len(programs):
        LogGo.error("ids count does not match programs count")
        return False
    count = len(ids)
    element = 0
    if count > 0:
        for id, program in zip(ids, programs):
            try:
                self.program.save(program, id)
                element += 1
            except Exception:
                import traceback
                LogGo.warning(traceback.format_exc())
        LogGo.info('Total :' + str(count) + ' / ' + str(element) + ' elements Saved!')
        status = element > 0
    else:
        LogGo.info("0 Element!")
    return status
def gs_detail(self, target, detail_page_bundle, content_ruler, encode):
    LogGo.info("On gs detail: " + str(target.data_key))
    try:
        order = self.wechat_order
        gs = GsdataRuler()
        result = gs.scan_detail(target, detail_page_bundle, order, content_ruler, encode)
        code, detail_page_result = result
        if code == 1:
            if detail_page_result is not None:
                return (1, detail_page_result)
        elif code == -3:
            # Transient failure: park the bundle for a later retry pass.
            # The retry code is -3, matching the other detail handlers.
            self.temp_list.append((target, detail_page_bundle, content_ruler, encode))
    except Exception as e:
        LogGo.warning(repr(e))
        return (-1, e.args[0] if e.args else None)
    return (0, None)
def purify(self, list):
    if len(list) < 1:
        return []
    result = []
    for i in list:
        try:
            flag = True
            id = i['id']
            text = i['text']
            limited_attitude_count = i['attitudes_count']
            limited_forward_count = i['reposts_count']
            program_count = 0
            # Base filter: drop duplicate ids already collected.
            for seq in result[::-1]:
                sid = seq['id']
                if id == sid:
                    flag = False
                    break
            # First program-name filter (may still let non-program posts through).
            if flag and text.count('《') < 1:
                flag = False
            # Second pass: threshold filters on the scraped counts.
            if flag and self.limited_attitude_count is not None and limited_attitude_count is not None:
                if limited_attitude_count < self.limited_attitude_count:
                    flag = False
            if flag and self.limited_forward_count is not None and limited_forward_count is not None:
                if limited_forward_count < self.limited_forward_count:
                    flag = False
            # Third pass: drop posts that mention too many known program names.
            if flag:
                for program in self.exist_program:
                    if text.count(program) >= 1:
                        program_count = program_count + 1
                        if program_count > 3:
                            flag = False
                            break
            if flag:
                result.append(i)
        except Exception:
            import traceback
            LogGo.warning(traceback.format_exc())
    return result
def start(self):
    try:
        if self.config.check_table:
            self.check_data_base()
        # RequestHelperClassVer.init(self.config)
        # ProxyHelper.init(self.config)
        # MysqlHelper.init(self.config)
        self.start_mormal_mission()
    except Exception:
        import traceback
        LogGo.warning(traceback.format_exc())
def gs_list(self, target):
    LogGo.info("On gs list: " + str(target.data_key))
    try:
        exists = self.exists_signature
        gs = GsdataRuler()
        code, value = gs.scan_list(target, exists)
        return (code, value)
    except Exception as e:
        LogGo.warning(repr(e))
        return (0, None)
def newrank_list(self, target):
    LogGo.info("On newrank list: " + str(target.data_key))
    try:
        exists = self.exists_title
        newrank = NewrankRuler()
        code, value = newrank.scan_list(target, exists)
        return (code, value)
    except Exception as e:
        LogGo.warning(repr(e))
        return (0, None)
def weibo_list(self, target):
    LogGo.info("On weibo list: " + str(target.data_key))
    try:
        exists = self.exists_identifier
        weibo = WeiboRuler()
        code, value = weibo.scan_list(target, exists)
        return (code, value)
    except Exception as e:
        LogGo.warning(repr(e))
        return (0, None)
def store(self, news_list: list, article_list: list, heavy_list: list = None):
    status = False
    if len(article_list) != len(news_list):
        LogGo.error("news count does not match article count")
        return False
    count = len(news_list)
    element = 0
    if count > 0:
        if heavy_list is not None:
            # Save news, article, and heavy rows under the same generated id.
            for news, article, heavy in zip(news_list, article_list, heavy_list):
                try:
                    id = genUUID()
                    self.news.save(news, id)
                    self.article.save(article, id)
                    self.heavy.save_with_news_id(heavy, id)
                    element += 1
                except Exception as e:
                    LogGo.warning(repr(e))
        else:
            for news, article in zip(news_list, article_list):
                try:
                    id = genUUID()
                    self.news.save(news, id)
                    self.article.save(article, id)
                    element += 1
                except Exception as e:
                    LogGo.warning(repr(e))
        LogGo.info('Total :' + str(count) + ' / ' + str(element) + ' elements Saved!')
        status = element > 0
    else:
        LogGo.info("0 Element!")
    return status
def scrape_list(self, target):
    LogGo.info("On scrape list: " + str(target.data_key))
    try:
        exists = self.exists_url
        ulweb = UlWebRuler()
        code, value = ulweb.scan_list(target, exists)
        return (code, value)
    except WebTargetOutOfDateException as e:
        LogGo.warning(e.args[0])
    except Exception as e:
        LogGo.warning(repr(e))
    return (0, None)
def start(self):
    result = 0
    try:
        LogGo.info('Sogou crawler')
        # Updatemp.loot()
        return result
    except Exception as e:
        import traceback
        print(traceback.format_exc())
        LogGo.warning(e)
        result = -1
    return result
def store_soap(self, soap):
    status = False
    if len(soap) < 1:
        LogGo.info("no data to save!")
        return False
    try:
        id = genUUID()
        self.soap.save(soap, id)
        status = True
        LogGo.info('Soap Saved')  # only logged when the save actually succeeded
    except Exception:
        import traceback
        LogGo.warning(traceback.format_exc())
    return status
def weibo_detail(self, target, detail_page_bundle, content_ruler, encode):
    LogGo.info("On weibo detail: " + str(target.data_key))
    try:
        order = self.weibo_order
        weibo = WeiboRuler()
        detail_page_result_dic = weibo.scan_detail(target, detail_page_bundle, order, content_ruler, encode)
        if detail_page_result_dic is not None:
            return (1, detail_page_result_dic)
    except Exception as e:
        LogGo.warning(repr(e))
        return (-1, e.args[0] if e.args else None)
    return (0, None)
def scan_list(self, target, exists): """请求参数""" par = (['flag', 'true'], ['uuid', target.extra0]) """抓取关键字""" keys = [ 'title', 'author', 'publicTime', 'url', 'clicksCount', 'likeCount', 'publicTime', 'summary' ] list = [] result_list = [] try: raw = RequestHelper.post(NewrankRuler.url, par, file_cookie=Configs.newrank_cookie_file) except Exception as e: import traceback msg = traceback.format_exc() # print(msg) LogGo.warning(msg) return (-1, (target, None, None, None)) try: list = ExtraJSON.extra_newrank_wechat_list(raw, keys) except: return (-1, (target, None, None, None)) if len(list) > 0: list.reverse() for item in list: if exists.count(item['title']) < 1: result_list.append(item) LogGo.debug('newrank list length:' + str(len(result_list))) if len(result_list) > 0: return (1, (target, list, None, None)) return (-1, (target, None, None, None))
def anyList(rawData, keys, start):
    par_list = keys
    con = []
    offset = start
    dic = dict()
    for key in par_list:
        try:
            # getContent returns a (value, consumed_length) pair; the length
            # advances the scan cursor for the next key.
            content = ExtraJSON.getContent(rawData, key, offset)
            con.append(content)
            offset += content[1]
            dic[key] = content[0]
        except Exception as e:
            LogGo.warning(e)
            continue
    minLen = 0
    for i in con:
        minLen += i[1]
    try:
        nextIndex = rawData.index(keys[0], start + minLen)
    except ValueError:
        # No further record: signal the caller with nextIndex = 0.
        # (The original returned a 2-tuple here, which broke 3-way unpacking.)
        return (con, 0, dic)
    return (con, nextIndex, dic)
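# --- Illustration (hypothetical, not part of the original code) ---
# anyList assumes ExtraJSON.getContent(rawData, key, offset) returns a
# (value, consumed_length) tuple, so repeated records can be walked by
# re-finding keys[0] past the characters already consumed. A toy stand-in
# for that contract, scanning `"key":"value"` pairs in a raw JSON string:
def get_content_stub(raw_data: str, key: str, offset: int):
    probe = '"' + key + '":"'
    begin = raw_data.index(probe, offset) + len(probe)
    end = raw_data.index('"', begin)
    # consumed_length is measured from the search offset, matching how
    # anyList advances its cursor with `offset += content[1]`.
    return raw_data[begin:end], end + 1 - offset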
def loops(self, target, exists, result):
    try:
        base_url = target.extra0
        for i in range(0, Configs().length_weibo):
            print("page: " + str(i))
            list = self.build_and_request(WeiboRuler.keys, base_url, WeiboRuler.request_getindex, i)
            if len(list) == 0:
                break
            for item in list:
                # Duplicate check against already-stored ids during routine scans;
                # hitting a known id means the rest of the feed is old, so stop.
                if exists.count(item['id']) < 1:
                    result.append(item)
                else:
                    return
    except Exception as e:
        import traceback
        print(traceback.format_exc())
        LogGo.warning(repr(e))
        LogGo.warning("Scan Failed!")
    return
def scan_list(self, target, exists) -> tuple:
    # order = ScrappdeDataDao.get_max_order('web')  # ordering code from the database
    detail_page_bundle_list = []
    first = target.extra0    # e.g. 'http://ent.people.com.cn/GB/81374/index1.html'
    second = target.extra1   # e.g. 'http://ent.people.com.cn/GB/81374/index2.html'
    third = target.extra2    # e.g. 'http://ent.people.com.cn/GB/81374/index3.html'
    str_parent_container = target.extra3  # e.g. '(div,[class:ej_list_box clear])'
    str_list_container = target.extra4    # e.g. '(li,[])'
    list_ruler = target.extra5            # e.g. 'link:a href()=;title:a;data:em'
    content_ruler = target.extra6
    list_json_path = target.extra9
    url = ''
    one = False
    if second == "" or second is None:
        # No second sample URL: this is a single-page target.
        one = True
        url = first
    analyser = None
    if not one:
        analyser = Analyser(first, second, third)
    try:
        encode = self.extractor.get_page_encode(first)
    except HttpConnectionFailedException:
        return (-2, (target, None, None, None))
    for i in range(1, Configs.length_web):
        LogGo.info('scanning index: ' + str(i))
        if not one:
            url = analyser.get_url(i)
        try:
            raw = self.req.get(url, encode=encode)
            if target.type == 'ulweb':
                self.looper_html(detail_page_bundle_list, raw, exists, list_ruler, str_parent_container, str_list_container)
            elif target.type == 'jsweb':
                raw = self.extra4_process(target.extra4, raw)
                self.looper_js(detail_page_bundle_list, raw, exists, list_ruler, str_parent_container, list_json_path=list_json_path)
            if one:
                break
        except AttributeError:
            pass
        except Exception as e:
            LogGo.warning(repr(e))
    if len(detail_page_bundle_list) > 0:
        detail_page_bundle_list.reverse()
        detail_page_bundle_list = self.purify(detail_page_bundle_list, 'link')
        return (1, (target, detail_page_bundle_list, content_ruler, encode))
    else:
        return (-1, (target, None, None, None))
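# --- Sketch (hypothetical): inferring the pagination pattern ---
# scan_list hands Analyser three sample URLs that differ only in the page
# number (index1/index2/index3 in the commented examples). One way such a
# get_url(i) could work is to diff two samples and substitute the counter;
# this stand-in is an assumption, not the original Analyser implementation.
def infer_page_url(first: str, second: str, index: int) -> str:
    # Longest common prefix of the two sample URLs.
    prefix = 0
    while prefix < min(len(first), len(second)) and first[prefix] == second[prefix]:
        prefix += 1
    # Longest common suffix that does not overlap the prefix.
    suffix = 0
    while suffix < min(len(first), len(second)) - prefix and first[-1 - suffix] == second[-1 - suffix]:
        suffix += 1
    # Whatever varies between the samples is treated as the page counter.
    return first[:prefix] + str(index) + (first[len(first) - suffix:] if suffix else '')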
def build_base_dic(self, target, list, existsUrls, order):
    news = TBNews()
    article = TBArticle()
    picture_dao = PictureDao()
    result = []
    article_result = []
    # Fetch the article body for each list entry.
    for i in list:
        try:
            i['content_url'] = UrlHelper.unify(i['content_url'])
            if existsUrls.count(i['content_url']) < 1:
                LogGo.info(">>> file id: " + str(i['fileid']))
                LogGo.info(">>> url: " + str(i['content_url']))
                try:
                    tup = ExtraJSON.wechat_extra_content(i['content_url'])
                except Exception as e:
                    print(e)
                    LogGo.info("extra content error.")
                    LogGo.info("possibly a deleted msg")
                    continue
                raw_content = tup[1]
                content = tup[2]
                # Dict keys map to database column names; values are what gets stored.
                dic = dict()
                article_dic = dict()
                order = order + 5
                dic[news.order_code.key] = order  # ordering code
                dic[news.create_date.key] = datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S')  # record creation time
                dic[news.valid.key] = 1
                dic[news.text_not_format.key] = content  # body text with markup stripped
                # dic[news.text_blob.key] = raw_content  # original field with markup
                dic[news.subscribe_time.key] = i['datetime']  # publication date
                dic[news.author.key] = i['author']  # publishing organization
                dic[news.title.key] = i['title']  # article title
                dic[news.subject.key] = i['digest']  # summary
                dic[news.status.key] = 2
                picture_id = picture_dao.save_data(i['cover'])
                dic[news.main_pic_id.key] = picture_id  # list-image id
                article_dic[article.fingerprint.key] = md5(i['content_url'])  # fingerprint derived from the URL
                article_dic[article.target_id.key] = target.id
                article_dic[article.company.key] = target.data_key  # publishing organization
                article_dic[article.content_url.key] = i['content_url']  # body link
                article_dic[article.scrabble_type.key] = 'wechat'  # article type; fixed to 'wechat'
                article_dic[article.is_scrabbled.key] = 1  # marks the row as scraped data
                result.append(dic)
                article_result.append(article_dic)
        except Exception as e:
            import traceback
            print(traceback.format_exc())
            LogGo.warning(repr(e))
            continue
    return result, article_result
def scan_detail(self, target, detail_page_bundle, order, content_ruler, encode):
    news = TBNews()
    article = TBArticle()
    # picture_dao = PictureDao()
    result_dic = dict()
    try:
        info = self.ready_info(detail_page_bundle['title'], detail_page_bundle['url'])
        LogGo.info(info)
        try:
            tup = self.jsons.wechat_extra_content(detail_page_bundle['url'])
        except HttpConnectionFailedException as e:
            LogGo.warning(repr(e))
            return (-3, None)
        except AttributeError:
            LogGo.warning("Maybe a deleted msg, complete the code to detect this error")
            return (-2, None)
        except Exception:
            LogGo.warning("Error when get detail message!")
            return (-2, None)
        raw_content = tup[1]
        content = tup[2]
        picture = tup[3]
        # Dict keys map to database column names; values are what gets stored.
        news_dic = dict()
        article_dic = dict()
        # ---------- NEWS ----------
        # List-image id (currently disabled):
        # if picture is not None:
        #     picture_id = picture_dao.save_data(picture)
        #     news_dic[news.main_pic_id.key] = picture_id
        news_dic[news.text_not_format.key] = content  # body text with markup stripped
        # news_dic[news.text_blob.key] = raw_content  # original field with markup
        news_dic[news.subscribe_time.key] = detail_page_bundle['publicTime']  # publication date
        news_dic[news.create_date.key] = datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S')  # record creation time
        news_dic[news.subject.key] = detail_page_bundle['summary']  # summary
        news_dic[news.valid.key] = 1
        news_dic[news.author.key] = detail_page_bundle['author']
        news_dic[news.title.key] = detail_page_bundle['title']  # article title
        news_dic[news.status.key] = 2
        order += 5
        news_dic[news.order_code.key] = order  # ordering code
        # ---------- ARTICLE ----------
        article_dic[article.content_url.key] = detail_page_bundle['url']  # body link
        article_dic[article.fingerprint.key] = md5(detail_page_bundle['url'])  # fingerprint derived from the URL
        article_dic[article.company.key] = target.data_key  # publishing organization
        article_dic[article.target_id.key] = target.id
        article_dic[article.raw_click_count.key] = detail_page_bundle['clicksCount']  # read count
        article_dic[article.vote_up_count.key] = detail_page_bundle['likeCount']  # like count
        article_dic[article.scrabble_type.key] = 'wechat'  # article type; fixed to 'wechat'
        article_dic[article.is_scrabbled.key] = 1  # marks the row as scraped data
        article_dic[article.publishStatus.key] = 1
        # article_dic[article.messageType.key] = random.randint(0, 1)
        # ---------- merge ----------
        result_dic.update(news_dic)
        result_dic.update(article_dic)
    except Exception:
        import traceback
        LogGo.warning(traceback.format_exc())
        return (-1, None)
    return (1, result_dic)
def scan(self, target, order):
    result = []
    type = self.td(target)
    url = target.extra0
    cap = None
    ruler = None
    # Single-letter platform codes, inferred from the *_base URL templates below:
    # i=iQiyi, l=LeTV, t=Tencent/QQ, m=Mango TV (mgtv), y=Youku, s=Sohu, c=CNTV.
    if type == 'i':
        cap = ['var tvInfoJs=', '']
        url = self.iqiyi_base.format(url)
        ruler = 'keywords:contentKeyword;latestOrder:latestOrder;name:name;playCount:playCount;score:score;videoCount:videoCount'
    elif type == 'l':
        ruler = 'score:plist_score;comments:pcommon_count;bullets:pdm_count;like:up;hate:down;playCount:plist_play_count'
        url = self.letv_base.format(url, target.extra1)
    elif type == 't':
        cap = ["tlux.dispatch('$cover',", ");"]
        ruler = 'score:score->score;playCount:view_all_count;videoCount:episode_all;latestOrder:episode_updatedd'
        url = self.qq_base.format(url)
    elif type == 'm':
        url = self.mgtv_base.format(url)
        cap = ['"data":', ',"msg"']
        ruler = 'playCount:all;like:like;hate:unlike'
    elif type == 'y':
        # The bracketed tokens are on-page labels matched by the HTML looper.
        ruler = 'playCount:li [总播放数];comments:li [评论];like:li [顶];score:span class=star-num'
        if not s.is_url(url):
            if not url.startswith('id'):
                url = self.youku_prefix.format(url)
            url = self.youku_base.format(url)
    elif type == 's':
        url = self.sohu_base.format(url)
    elif type == 'c':
        url = self.cntv_base.format(url)
        ruler = 'playCount:^label [播放次数]'
    try:
        encode = ExtraHtml.get_page_encode(url)
        if type == 'y' or type == 'c':
            result = self.looper_html(url, ruler, encode, target)
        else:
            raw = RequestHelper.get(url, encode=encode)
            if type == 's':
                result = self.finder_sohu(raw)
            else:
                result = self.looper_js(raw, ruler, cap)
    except AttributeError:
        pass
    except Exception as e:
        import traceback
        print(traceback.format_exc())
        LogGo.warning(repr(e))
    if len(result) > 0:
        result = self.build_base_dic(target, result, order)
        return result[0]
def build_count_dic(self, pro_list):
    result = []
    try:
        LogGo.info(">>> count: " + str(len(pro_list)))
        for programs in pro_list:
            # Dict keys map to database column names; values are what gets stored.
            dic = dict()
            total = 0
            for program in programs:
                try:
                    dic[TBProgramPlayCount.program.key] = program[TBSoap.program.key]
                    plantform = program[TBSoap.plantform.key]
                    count = program[TBSoap.play_count.key]
                    total += count
                    # Route the per-platform play count into its column.
                    if self.td(plantform) == 'i':
                        dic[TBProgramPlayCount.count1.key] = count
                    elif self.td(plantform) == 'l':
                        dic[TBProgramPlayCount.count2.key] = count
                    elif self.td(plantform) == 't':
                        dic[TBProgramPlayCount.count3.key] = count
                    elif self.td(plantform) == 'm':
                        dic[TBProgramPlayCount.count4.key] = count
                    elif self.td(plantform) == 'y':
                        dic[TBProgramPlayCount.count5.key] = count
                    elif self.td(plantform) == 's':
                        dic[TBProgramPlayCount.count6.key] = count
                except Exception:
                    import traceback
                    LogGo.info(traceback.format_exc())
            dic[TBProgramPlayCount.total_count.key] = total
            dic[TBProgramPlayCount.create_time.key] = datetime.datetime.now().strftime('%Y-%m-%d')  # record creation time
            result.append(dic)
    except BaseDateLackException as e:
        LogGo.warning("Lacks important data (" + str(e) + ')')
    except DataFormatException:
        pass
    except KeyError:
        import traceback
        LogGo.warning(traceback.format_exc())
    except Exception:
        import traceback
        LogGo.warning(traceback.format_exc())
    return result
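# --- Input-shape illustration (hypothetical values, not original data) ---
# build_count_dic expects pro_list to be a list of groups, one group per
# program, each group holding that program's per-platform TBSoap rows:
# pro_list = [
#     [  # one program's rows across platforms
#         {TBSoap.program.key: 'prog-1', TBSoap.plantform.key: 'i', TBSoap.play_count.key: 1200},
#         {TBSoap.program.key: 'prog-1', TBSoap.plantform.key: 't', TBSoap.play_count.key: 800},
#     ],
#     # ...
# ]
# Each group collapses into a single TBProgramPlayCount row with one
# count column per platform plus the summed total_count.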
def build_base_dic(self, target, result, order):
    soap = TBSoap()
    program_dao = ProgramDao()
    soap_result = []
    try:
        name = ''
        if Configs.show_utf:
            try:
                name = target.data_key
            except Exception:
                name = '<<error>>'
            LogGo.info(">>> name: " + str(name) + "(" + str(result['playCount']) + ")")
        # Dict keys map to database column names; values are what gets stored.
        # Every field except playCount is optional and skipped when missing.
        dic = dict()
        try:
            dic[soap.play_count.key] = result['playCount']  # instantaneous play count
        except KeyError as e:
            raise BaseDateLackException(str(e))
        try:
            dic[soap.keywords.key] = result['keywords']  # keywords
        except KeyError:
            pass
        try:
            dic[soap.bullet_count.key] = result['bullets']  # bullet-comment (danmaku) count
        except KeyError:
            pass
        try:
            dic[soap.hate_count.key] = result['hate']  # downvote count
        except KeyError:
            pass
        try:
            dic[soap.like_count.key] = result['like']  # like count
        except KeyError:
            pass
        try:
            dic[soap.latest_order.key] = result['latestOrder']  # latest episode
        except KeyError:
            pass
        try:
            dic[soap.name.key] = result['name']  # program title
        except KeyError:
            pass
        try:
            # Prefer the title recorded in the program table, when available.
            dic[soap.name.key] = program_dao.get_title_by_id(target.program_id)
        except Exception:
            pass
        try:
            dic[soap.score.key] = result['score']  # score
        except KeyError:
            pass
        try:
            dic[soap.video_count.key] = result['videoCount']  # video count
        except KeyError:
            pass
        try:
            dic[soap.program.key] = target.program_id  # program
            dic[soap.target.key] = target.id  # target
        except Exception:
            pass
        dic[soap.plantform.key] = target.soap_type
        order += 1
        dic[soap.order_code.key] = order  # ordering code
        dic[soap.create_date.key] = datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S')  # record creation time
        dic[soap.valid.key] = 1
        soap_result.append(dic)
    except BaseDateLackException as e:
        LogGo.warning("Lacks important data (" + str(e) + ')')
    except DataFormatException:
        pass
    except KeyError:
        import traceback
        LogGo.warning(traceback.format_exc())
    except Exception:
        import traceback
        LogGo.warning(traceback.format_exc())
    return soap_result
def build_single_page_dic(self, target, detail_page_bundle, order, content_ruler, encode):
    news = TBNews()
    article = TBArticle()
    result_dic = dict()
    try:
        LogGo.info(WeiboRuler.url_status + detail_page_bundle['id'])
        # Dict keys map to database column names; values are what gets stored.
        news_dic = dict()
        article_dic = dict()
        order += 2
        news_dic[news.order_code.key] = order  # ordering code
        sub_tim = detail_page_bundle['created_at']
        if sub_tim is not None:
            news_dic[news.subscribe_time.key] = sub_tim  # publication date
        else:
            LogGo.warning("no subscribe time!")
        news_dic[news.create_date.key] = DateGo.get_current_date()  # record creation time
        news_dic[news.status.key] = 1  # status
        news_dic[news.valid.key] = 1
        news_dic[news.title.key] = detail_page_bundle['text']
        news_dic[news.text_not_format.key] = detail_page_bundle['text']
        news_dic[news.text_blob.key] = detail_page_bundle['text']
        # Title extraction from page_info (currently disabled):
        # title = None
        # try:
        #     title = detail_page_bundle['page_info']['content1']
        # except Exception:
        #     pass
        # if title is None:
        #     news_dic[news.title.key] = detail_page_bundle['text']
        # else:
        #     news_dic[news.title.key] = title
        # Publishing organization: the poster's screen name.
        try:
            user = detail_page_bundle['user']
            screen_name = user['screen_name']
            article_dic[article.company.key] = screen_name
        except Exception:
            pass
        article_dic[article.vote_up_count.key] = detail_page_bundle['attitudes_count']  # like count
        article_dic[article.scrabble_type.key] = 'weibo'  # article type
        article_dic[article.is_scrabbled.key] = 1  # marks the row as scraped data
        article_dic[article.identifier.key] = detail_page_bundle['id']  # id of the post on its home platform
        article_dic[article.target_id.key] = target.id
        article_dic[article.content_url.key] = WeiboRuler.url_status + detail_page_bundle['id']  # body link
        article_dic[article.publishStatus.key] = 1
        # article_dic[article.messageType.key] = random.randint(0, 1)
        # For replies or quotes, record the id of the referenced post.
        try:
            retweeted_status = detail_page_bundle['retweeted_status']
            ret_id = retweeted_status['id']
            article_dic[article.identifier_re.key] = ret_id
        except Exception:
            pass
        # Read / repost / comment counts and the picture group are not stored yet:
        # dic['click_count'] = detail_page_bundle['clicksCount']
        # try:
        #     pics = detail_page_bundle['pics']
        #     if len(pics) > 0:
        #         group_id = PictureDao.save_group_data(pics)
        #         if group_id is not None:
        #             dic['group_picture_id'] = group_id
        # except Exception as e:
        #     LogGo.warning(dic['content_url'])
        #     LogGo.warning(e)
        result_dic.update(article_dic)
        result_dic.update(news_dic)
    except Exception:
        import traceback
        LogGo.warning(traceback.format_exc())
        return None
    return result_dic