def check_data_base(self):
    """Check the database structure.

    Instantiates every table definition listed below, passes them to a
    ``StructureGuard`` built from ``Configs()``, and writes the value
    returned by ``check`` to the info log when it is non-empty.
    """
    table_classes = (
        TBNews,
        TBArticle,
        TBProgram,
        TBWenzhangInfo,
        TBDictionaryType,
        TBDictionary,
        TBNewsGroup,
        TBNlpFilter,
        TBScrapingTarget,
        TBSoap,
        TBSoapTarget,
        TBGlobalTarget,
        TBMR,
        TBSpecialTarget,
        TBProgramType,
        TBSoapBlackList,
        TBHeavyText,
    )
    tables = [table_class() for table_class in table_classes]

    guard = StructureGuard(Configs())
    report = guard.check(tables)
    if len(report) > 0:
        LogGo.info(str(report))
def send_to_queue(result):
    """Forward the outcome of a list-page scan to the target queue.

    Result codes seen by this function:
        0  maximum access frequency reached
        1  normal result

    :param result: ``(code, value)`` pair. ``value`` is normally the tuple
        ``(target, detail_page_bundle_list, content_ruler, encode)``.
        A ``None`` value is tolerated and only logged.
    :return: None
    """
    global target_mutex, target_count, target_transported_count, all_target_transported

    code, value = result

    if value is None:
        # NOTE(review): the *_list helpers return (0, None) when they fail;
        # presumably those results reach this function -- confirm. Unpacking
        # None used to raise TypeError here and skipped the bookkeeping below.
        LogGo.error("List Page Error: no payload returned, Code: " + str(code))
    else:
        target, detail_page_bundle_list, content_ruler, encode = value

        if code == 1:
            for detail_page_bundle in detail_page_bundle_list:
                target_producer.target_queue.queue.put(
                    (target, detail_page_bundle, content_ruler, encode))
            ScrabingTarget.set_last_access_date(target.id)
        else:
            LogGo.error("List Page Error:" + str(target.data_key) + " Code: " + str(code))
            ScrabingTarget.set_elog(target.id, "error code: " + str(code))

    # The context manager releases the lock even if the body raises; the
    # previous acquire()/release() pair left it held on any exception.
    with target_mutex:
        if target_count == target_transported_count:
            all_target_transported = True
        else:
            target_transported_count += 1

        LogGo.debug('target_transported_count: ' + str(target_transported_count))
def scrape_detail(self, target, detail_page_bundle, content_ruler, encode):
    """Scan one web detail page through ``UlWebRuler.scan_detail``.

    :param target: scraping target the detail page belongs to
    :param detail_page_bundle: detail-page data handed to the ruler
    :param content_ruler: content rule handed to the ruler
    :param encode: encoding handed to the ruler
    :return: ``(1, detail_page_result)`` when the ruler reports code 1 with a
        non-None result; ``(-1, message)`` when an exception was raised;
        ``(0, None)`` otherwise. On ruler code -3 the call arguments are
        appended to ``self.temp_list`` before ``(0, None)`` is returned.
    """
    try:
        order = self.web_order
        ulweb = UlWebRuler()

        code, detail_page_result = ulweb.scan_detail(
            target, detail_page_bundle, order, content_ruler, encode)

        if code == 1:
            if detail_page_result is not None:
                return (1, detail_page_result)
        elif code == -3:
            self.temp_list.append(
                (target, detail_page_bundle, content_ruler, encode))
    except WebTargetOutOfDateException as e:
        # Exceptions built without arguments have empty ``args``; fall back
        # to repr() instead of raising IndexError from inside the handler.
        return (-1, e.args[0] if e.args else repr(e))
    except Exception as e:
        LogGo.warning(repr(e))
        return (-1, e.args[0] if e.args else repr(e))

    return (0, None)
def scan_list(self, target, exists):
    """Fetch the gsdata article list for ``target``.

    :param target: scraping target; ``extra0`` and ``wx_hao`` are formatted
        into ``self.url``
    :param exists: collection of already known titles (queried with
        ``count``)
    :return: ``(1, (target, entries, None, None))`` when at least one entry
        has a title not found in ``exists``; otherwise
        ``(-1, (target, None, None, None))``
    """
    failure = (-1, (target, None, None, None))

    container_key = 'data'
    field_map = 'author:author;title:title;date:posttime;img:picurl;link:url;top:top;click:readnum_newest;vote_up:likenum_newest;subject:content'

    request_url = self.url.format(target.extra0, target.wx_hao)
    ajax_header = {'X-Requested-With': 'XMLHttpRequest'}
    raw = RequestHelper.get(request_url, header=ajax_header,
                            file_cookie=Configs.gsdata_cookie_file)

    entries = []
    try:
        self.looper_js(entries, raw, exists, field_map, container_key)
    except Exception as e:
        E.out_err(e)
        return failure

    unseen = []
    if len(entries) > 0:
        entries = self.sort(entries)
        entries.reverse()

        unseen = [item for item in entries if exists.count(item['title']) < 1]

        LogGo.debug('newrank list length:' + str(len(unseen)))

    if len(unseen) > 0:
        # NOTE(review): the payload carries the full sorted list, not only
        # the unseen entries -- confirm this is intended.
        return (1, (target, entries, None, None))

    return failure
def reset_shutdown_status():
    """Write ``"0"`` into the system shutdown flag file.

    The file name comes from ``Configs().system_shutdown_flag_filename``.
    Failures are logged and not propagated.
    """
    try:
        file_name = Configs().system_shutdown_flag_filename
        FileHelper.create(file_name, "0")
    except Exception:
        # ``except Exception`` rather than a bare ``except`` so that
        # KeyboardInterrupt / SystemExit are not swallowed here.
        LogGo.error("error while reseting shutdown flag file!!!")
def start_mormal_mission(self):
    """Create the producer/consumer workers, start them and keep polling.

    The upload consumer is started first, then the target consumer, then
    the target producer. Afterwards the method loops forever, logging the
    value of ``target_producer.is_all_target_transported()`` every five
    seconds.
    """
    targets = get_target_list()

    self.target_producer = target_producer(
        targets,
        self.config.target_pool_size,
        self.config.target_queue_size,
    )
    self.target_consumer = target_consumer()
    self.upload_consumer = upload_consumer(self.config.uploader_queue_size)

    # Start order matters: consumers before the producer.
    for worker in (self.upload_consumer, self.target_consumer, self.target_producer):
        worker.start()

    while True:
        time.sleep(5)
        LogGo.debug("target_transported_over: {!s}".format(
            target_producer.is_all_target_transported()))

    # NOTE(review): unreachable -- the loop above has no exit condition.
    LogGo.info('Loop Done! task count: {}'.format(len(targets)))
    SMTPServer.launch_mission_report()
def newrank_detail(self, target, detail_page_bundle, content_ruler, encode):
    """Scan one newrank detail page through ``NewrankRuler.scan_detail``.

    :return: ``(1, detail_page_result)`` when the ruler reports code 1 with a
        non-None result; ``(-1, None)`` when any exception was raised;
        ``(0, None)`` otherwise. On ruler code -3 the call arguments are
        appended to ``self.temp_list`` before ``(0, None)`` is returned.
    """
    LogGo.info("On newrank detail: {!s}".format(target.data_key))

    try:
        ranking_order = self.wechat_order
        ruler = NewrankRuler()
        outcome = ruler.scan_detail(
            target, detail_page_bundle, ranking_order, content_ruler, encode)
        code, detail_page_result = outcome

        if code == 1 and detail_page_result is not None:
            return (1, detail_page_result)

        if code == -3:
            retry_entry = (target, detail_page_bundle, content_ruler, encode)
            self.temp_list.append(retry_entry)
    except Exception:
        LogGo.warning('error in newrank detail')
        return (-1, None)

    return (0, None)
def sogou_transfor(self):
    """Log the start of the sogou transfer task and report 0.

    No transfer work is performed here; the method only writes the log
    line and returns the constant result code.

    :return: always ``0``
    """
    LogGo.info('搜狗转移')
    return 0
def store_soap_target(self, targets: [], banned: bool = False):
    """Save every target, one by one, and report whether any was stored.

    :param targets: items to persist
    :param banned: when true the items are saved through
        ``self.banned_program``, otherwise through ``self.soap_target``
    :return: ``True`` if at least one item was saved without raising,
        ``False`` otherwise
    """
    import traceback

    saved = 0

    for target in targets:
        try:
            dao = self.banned_program if banned else self.soap_target
            dao.save(target)
            saved += 1
        except Exception:
            # A failing item is logged and skipped; the rest still run.
            LogGo.warning(traceback.format_exc())

    LogGo.info('Total :{} / {} elements Saved!'.format(saved, len(targets)))

    return saved != 0
def gs_detail(self, target, detail_page_bundle, content_ruler, encode):
    """Scan one gsdata detail page through ``GsdataRuler.scan_detail``.

    :param target: scraping target the detail page belongs to
    :param detail_page_bundle: detail-page data handed to the ruler
    :param content_ruler: content rule handed to the ruler
    :param encode: encoding handed to the ruler
    :return: ``(1, detail_page_result)`` when the ruler reports code 1 with a
        non-None result; ``(-1, message)`` when an exception was raised;
        ``(0, None)`` otherwise. On ruler code 3 the call arguments are
        appended to ``self.temp_list`` before ``(0, None)`` is returned.
    """
    LogGo.info("On gs detail: " + str(target.data_key))

    try:
        order = self.wechat_order
        gs = GsdataRuler()

        code, detail_page_result = gs.scan_detail(
            target, detail_page_bundle, order, content_ruler, encode)

        if code == 1:
            if detail_page_result is not None:
                return (1, detail_page_result)
        # NOTE(review): the sibling detail methods test for -3 here;
        # confirm that GsdataRuler really reports 3.
        elif code == 3:
            self.temp_list.append(
                (target, detail_page_bundle, content_ruler, encode))
    except Exception as e:
        LogGo.warning(repr(e))
        # Exceptions built without arguments have empty ``args``; fall back
        # to repr() instead of raising IndexError from inside the handler.
        return (-1, e.args[0] if e.args else repr(e))

    return (0, None)
def scan_list(self, target, exists):
    """Collect the weibo status list for ``target``.

    Copies the two count thresholds from ``target`` onto ``self``, lets
    ``self.loops`` fill a list, filters it through ``self.purify`` and
    reverses it.

    :param target: scraping target providing ``limited_forward_count`` and
        ``limited_attitude_count``
    :param exists: collection of already known ids (queried with ``count``)
    :return: ``(0, (target, None, None, None))`` when the scan produced
        nothing; ``(1, (target, statuses, None, None))`` when at least one
        purified status has an id not found in ``exists``; otherwise
        ``(-1, (target, None, None, None))``
    """
    self.limited_forward_count = target.limited_forward_count
    self.limited_attitude_count = target.limited_attitude_count

    statuses = []

    # Simulated login
    status = 'you got it'

    # Only scan when the login succeeded
    if status == '':
        LogGo.warning("Weibo: Loop scan faild!")
        return (-1, (target, None, None, None))

    self.loops(target, exists, statuses)
    if len(statuses) < 1:
        return (0, (target, None, None, None))

    statuses = self.purify(statuses)
    statuses.reverse()

    unseen = [item for item in statuses if exists.count(item['id']) < 1]

    LogGo.debug('newrank list length:' + str(len(unseen)))

    if len(unseen) > 0:
        # NOTE(review): the payload carries the full purified list, not
        # only the unseen entries -- confirm this is intended.
        return (1, (target, statuses, None, None))

    return (-1, (target, None, None, None))
def set_shutdown_status():
    """Write ``"1"`` into the system shutdown flag file.

    The file name comes from ``Configs().system_shutdown_flag_filename``.
    Failures are logged and not propagated.
    """
    try:
        file_name = Configs().system_shutdown_flag_filename
        FileHelper.create(file_name, "1")
    except Exception:
        # ``except Exception`` rather than a bare ``except`` so that
        # KeyboardInterrupt / SystemExit are not swallowed here.
        LogGo.error("something wrong at setting shutdown flag file !!!")
def purify(self, list):
    """Filter a list of status dicts and return the ones that pass.

    An entry is kept only when all of the following hold:

    * its ``id`` is not already among the kept entries;
    * its ``text`` contains at least one ``《``;
    * ``attitudes_count`` / ``reposts_count`` are not below
      ``self.limited_attitude_count`` / ``self.limited_forward_count``
      (a check is skipped when either side of it is ``None``);
    * no more than three names from ``self.exist_program`` occur in the
      text.

    An entry that raises during these checks is logged and skipped.

    :param list: status dicts with the keys ``id``, ``text``,
        ``attitudes_count`` and ``reposts_count``
    :return: new list with the accepted entries in their original order
    """
    if len(list) == 0:
        return []

    accepted = []

    for entry in list:
        try:
            entry_id = entry['id']
            text = entry['text']
            attitudes = entry['attitudes_count']
            reposts = entry['reposts_count']

            # Basic filter: drop repeated ids
            if any(entry_id == kept['id'] for kept in reversed(accepted)):
                continue

            # First pass on program names (may still match non-programs)
            if text.count('《') == 0:
                continue

            # Second pass: count thresholds
            if (self.limited_attitude_count is not None
                    and attitudes is not None
                    and attitudes < self.limited_attitude_count):
                continue

            if (self.limited_forward_count is not None
                    and reposts is not None
                    and reposts < self.limited_forward_count):
                continue

            # Third pass: reject texts naming more than three known programs
            program_hits = 0
            for program in self.exist_program:
                if text.count(program) >= 1:
                    program_hits += 1
                    if program_hits > 3:
                        break
            else:
                accepted.append(entry)
        except Exception:
            import traceback
            LogGo.warning(traceback.format_exc())

    return accepted
def base_init():
    """Initialise the shared helpers, each with a fresh ``Configs()``.

    The helpers are set up in the order listed below; ``BaseStrategy.init``
    runs last and takes no arguments.
    """
    config_consumers = (
        LogGo.init,
        RequestHelper.init,
        SMTPServer.init,
        Download,
        RequestHelperClassVer.init,
        ProxyHelper.init,
        MysqlHelper.init,
    )
    for initialise in config_consumers:
        initialise(Configs())

    BaseStrategy.init()
def ExtraList(self, target, existsUrls, order):
    """Collect the message list of a wechat target and build its records.

    The first page is requested from ``target.extra0`` and parsed with
    ``ExtraJSON.extraWechatList``. Further pages are pulled through
    ``self.loopToFail`` until it returns an empty list, a continue flag
    other than 1, or raises. The collected list is reversed and handed to
    ``self.build_base_dic``.

    :param target: scraping target; ``extra0`` is used as the request URL
    :param existsUrls: passed through to ``self.build_base_dic``
    :param order: passed through to ``self.build_base_dic``
    :return: the value returned by ``self.build_base_dic``, or an empty
        list when the first page could not be parsed
    """
    url = str(target.extra0)

    # Fetch the first page
    raw = WechatRuler.req._get(url)

    try:
        first_page = ExtraJSON.extraWechatList(raw, 'msgList', WechatRuler.keys)
        messages = first_page[0]
        next_index = str(first_page[1])
    except Exception as e:
        print(e)
        print("ERROR")
        return []

    while True:
        try:
            progress = '>>> scaning id: ' + next_index
            print(progress)
            LogGo.info(progress)

            page = self.loopToFail(url, next_index)
            page_items = page[0]
            next_index = str(page[1])
            keep_going = page[2]

            if len(page_items) == 0:
                break

            messages.extend(page_items)

            if keep_going != 1:
                break
        except Exception as e:
            print(e)
            break

    print('>>> list scaning completed')
    print('>>>')

    messages.reverse()

    print('>>> Start Build SQL')
    records = self.build_base_dic(target, messages, existsUrls, order)
    print('>>> Build SQL Success')
    print('>>>')

    return records
def start(self):
    """Open the pool and submit every target to it, over and over.

    Each round submits ``target_producer.worker`` once per entry of
    ``self.targets`` through ``self.pool.apply_async`` and then sleeps for
    ``Configs().work_interval``. The loop has no exit condition.

    :return: never returns
    """
    while True:
        LogGo.info("Start target pool")

        for target in self.targets:
            self.pool.apply_async(target_producer.worker, (target,))

        time.sleep(Configs().work_interval)
def check_shutdown_status():
    """Tell whether the shutdown flag file currently requests a shutdown.

    Reads the file named by ``Configs().system_shutdown_flag_filename``,
    trims it and parses it as an integer. When the value is 1 the flag is
    reset through ``somebody_help.reset_shutdown_status()``.

    :return: ``True`` when the flag was 1; ``False`` for any other value
        or when the file could not be read or parsed
    """
    try:
        file_name = Configs().system_shutdown_flag_filename
        status = int(StringHelper.trim(FileHelper.read(file_name)))

        if status != 1:
            return False

        somebody_help.reset_shutdown_status()
        return True
    except Exception:
        # ``except Exception`` rather than a bare ``except`` so that
        # KeyboardInterrupt / SystemExit are not swallowed here.
        LogGo.error("system_shutdown_flag_file unavailable!")
        return False
def store_program(self, ids, programs):
    """Save each program under the id at the same position.

    :param ids: ids, one per program
    :param programs: programs to persist through ``self.program.save``
    :return: ``True`` if at least one program was saved without raising;
        ``False`` when the two inputs differ in length, are empty, or
        every save failed
    """
    import traceback

    if len(ids) != len(programs):
        LogGo.error("ids count unmatch programs count")
        return False

    total = len(ids)
    if total == 0:
        LogGo.info("0 Element!")
        return False

    saved = 0
    for program_id, program in zip(ids, programs):
        try:
            self.program.save(program, program_id)
            saved += 1
        except Exception:
            # A failing item is logged and skipped; the rest still run.
            LogGo.warning(traceback.format_exc())

    LogGo.info('Total :{} / {} elements Saved!'.format(total, saved))

    return saved != 0
def start(self):
    """Run the optional table check and then the normal mission.

    ``self.check_data_base()`` is called only when
    ``self.config.check_table`` is truthy. Any exception raised by either
    step is logged with its traceback and not propagated.
    """
    import traceback

    try:
        if self.config.check_table:
            self.check_data_base()

        self.start_mormal_mission()
    except Exception:
        LogGo.warning(traceback.format_exc())
def newrank_list(self, target):
    """Run ``NewrankRuler.scan_list`` for ``target``.

    :param target: scraping target; ``self.exists_title`` is passed along
        as the collection of known titles
    :return: the ``(code, value)`` pair produced by the ruler, or
        ``(0, None)`` when anything raised
    """
    LogGo.info("On newrank list: {!s}".format(target.data_key))

    try:
        known_titles = self.exists_title
        ruler = NewrankRuler()
        code, value = ruler.scan_list(target, known_titles)
    except Exception as err:
        LogGo.warning(repr(err))
        return (0, None)

    return (code, value)
def gs_list(self, target):
    """Run ``GsdataRuler.scan_list`` for ``target``.

    :param target: scraping target; ``self.exists_signature`` is passed
        along as the collection of known entries
    :return: the ``(code, value)`` pair produced by the ruler, or
        ``(0, None)`` when anything raised
    """
    LogGo.info("On gs list: {!s}".format(target.data_key))

    try:
        known_signatures = self.exists_signature
        ruler = GsdataRuler()
        code, value = ruler.scan_list(target, known_signatures)
    except Exception as err:
        LogGo.warning(repr(err))
        return (0, None)

    return (code, value)
def weibo_list(self, target):
    """Run ``WeiboRuler.scan_list`` for ``target``.

    :param target: scraping target; ``self.exists_identifier`` is passed
        along as the collection of known entries
    :return: the ``(code, value)`` pair produced by the ruler, or
        ``(0, None)`` when anything raised
    """
    LogGo.info("On weibo list: {!s}".format(target.data_key))

    try:
        known_identifiers = self.exists_identifier
        ruler = WeiboRuler()
        code, value = ruler.scan_list(target, known_identifiers)
    except Exception as err:
        LogGo.warning(repr(err))
        return (0, None)

    return (code, value)
def store(self, news_list: [], article_list: [], heavy_list: [] = None):
    """Save news/article pairs (and optional heavy texts) under shared ids.

    Every row gets a fresh id from ``genUUID()`` that is used for the news
    record, the article record and, when ``heavy_list`` is given, the
    heavy-text record.

    :param news_list: news records
    :param article_list: article records; must be as long as ``news_list``
    :param heavy_list: optional heavy-text records, zipped with the other
        two lists
    :return: ``True`` if at least one row was saved without raising;
        ``False`` when the news and article counts differ, the input is
        empty, or every row failed
    """
    if len(article_list) != len(news_list):
        LogGo.error("news count unmatch article count")
        return False

    total = len(news_list)
    if total == 0:
        LogGo.info("0 Element!")
        return False

    with_heavy = heavy_list is not None
    if with_heavy:
        rows = zip(news_list, article_list, heavy_list)
    else:
        rows = ((news, article, None) for news, article in zip(news_list, article_list))

    saved = 0
    for news, article, heavy in rows:
        try:
            row_id = genUUID()
            self.news.save(news, row_id)
            self.article.save(article, row_id)
            if with_heavy:
                self.heavy.save_with_news_id(heavy, row_id)
            saved += 1
        except Exception as e:
            # A failing row is logged and skipped; the rest still run.
            LogGo.warning(repr(e))

    LogGo.info('Total :{} / {} elements Saved!'.format(total, saved))

    return saved != 0
def start(self):
    """Log the start of the sogou crawler and report a result code.

    :return: ``0`` when the log call succeeded, ``-1`` when it raised (the
        traceback is printed and the exception logged as a warning)
    """
    try:
        LogGo.info('搜狗爬虫')
        return 0
    except Exception as e:
        import traceback
        print(traceback.format_exc())
        LogGo.warning(e)

    return -1
def weibo_detail(self, target, detail_page_bundle, content_ruler, encode):
    """Scan one weibo detail page through ``WeiboRuler.scan_detail``.

    :param target: scraping target the detail page belongs to
    :param detail_page_bundle: detail-page data handed to the ruler
    :param content_ruler: content rule handed to the ruler
    :param encode: encoding handed to the ruler
    :return: ``(1, result)`` when the ruler returned a non-None result;
        ``(-1, message)`` when an exception was raised; ``(0, None)``
        otherwise
    """
    LogGo.info("On weibo detail: " + str(target.data_key))

    try:
        order = self.weibo_order
        weibo = WeiboRuler()

        detail_page_result_dic = weibo.scan_detail(
            target, detail_page_bundle, order, content_ruler, encode)

        if detail_page_result_dic is not None:
            return (1, detail_page_result_dic)
    except Exception as e:
        LogGo.warning(repr(e))
        # Exceptions built without arguments have empty ``args``; fall back
        # to repr() instead of raising IndexError from inside the handler.
        return (-1, e.args[0] if e.args else repr(e))

    return (0, None)
def scan_list(self, target, exists):
    """Fetch the newrank article list for ``target``.

    :param target: scraping target; ``extra0`` is sent as the ``uuid``
        request parameter
    :param exists: collection of already known titles
    :return: ``(1, (target, entries, None, None))`` when at least one entry
        has a title not found in ``exists``; otherwise
        ``(-1, (target, None, None, None))`` (also when the request or the
        parsing raised)
    """
    import traceback

    failure = (-1, (target, None, None, None))

    # Request parameters
    par = (['flag', 'true'], ['uuid', target.extra0])

    # Keys to extract
    # NOTE(review): 'publicTime' appears twice -- confirm whether the
    # second occurrence is needed by ExtraJSON.
    keys = [
        'title', 'author', 'publicTime', 'url', 'clicksCount', 'likeCount',
        'publicTime', 'summary'
    ]

    try:
        raw = RequestHelper.post(NewrankRuler.url, par,
                                 file_cookie=Configs.newrank_cookie_file)
        entries = ExtraJSON.extra_newrank_wechat_list(raw, keys)
    except Exception:
        # Parsing failures used to be swallowed by a bare ``except`` without
        # any trace; they are now logged like request failures.
        LogGo.warning(traceback.format_exc())
        return failure

    unseen = []
    if len(entries) > 0:
        entries.reverse()

        unseen = [item for item in entries if item['title'] not in exists]

        LogGo.debug('newrank list length:' + str(len(unseen)))

    if len(unseen) > 0:
        # NOTE(review): the payload carries the full list, not only the
        # unseen entries -- confirm this is intended.
        return (1, (target, entries, None, None))

    return failure
def store_program_type(self, pair):
    """Save every ``(program, type)`` pair.

    :param pair: sized collection of two-item entries, each unpacked into a
        program and its type
    :return: ``True`` if at least one pair was saved without raising;
        ``False`` when the input is empty or every save failed
    """
    import traceback

    total = len(pair)
    if total == 0:
        LogGo.info("0 Element!")
        return False

    saved = 0
    for item in pair:
        try:
            program, program_type = item
            self.program_type.save_by_program_type(program, program_type)
            saved += 1
        except Exception:
            # A failing item is logged and skipped; the rest still run.
            LogGo.warning(traceback.format_exc())

    LogGo.info('Total :{} / {} elements Saved!'.format(total, saved))

    return saved != 0
def store_count(self, list):
    """Store every program that ``self.check_for_exists`` does not report.

    A program for which ``self.check_for_exists`` returns 1 is counted as
    updated; any other program is saved through ``self.pc.save`` under a
    fresh ``genUUID()`` id.

    :param list: sized collection of programs
    :return: ``True`` if at least one program was saved or counted as
        updated; ``False`` when the input is empty or every item failed
    """
    import traceback

    total = len(list)
    if total == 0:
        LogGo.info("0 Element!")
        return False

    saved = 0
    updated = 0
    for program in list:
        try:
            if self.check_for_exists(program) == 1:
                updated += 1
                continue

            self.pc.save(program, genUUID())
            saved += 1
        except Exception:
            # A failing item is logged and skipped; the rest still run.
            LogGo.warning(traceback.format_exc())

    LogGo.info('Total :{} ({} Saved, {} Updated)'.format(total, saved, updated))

    return saved != 0 or updated != 0
def gensto(self, dao, list):
    """Save or update every item through ``dao`` under a fresh id.

    :param dao: object providing ``save_or_update(item, id)``
    :param list: sized collection of items
    :return: ``True`` if at least one item was stored without raising;
        ``False`` when the input is empty or every item failed
    """
    total = len(list)
    if total == 0:
        LogGo.info("0 Element!")
        return False

    stored = 0
    for item in list:
        try:
            dao.save_or_update(item, genUUID())
            stored += 1
        except Exception as e:
            # A failing item is logged and skipped; the rest still run.
            LogGo.warning(repr(e))

    LogGo.info('Total :{} / {} elements Saved!'.format(total, stored))

    return stored != 0
def genup(self, dao, list):
    """Apply every ``(update, where)`` pair through ``dao.update``.

    :param dao: object providing ``update(update, where)``
    :param list: sized collection of two-item entries
    :return: ``True`` if at least one update ran without raising;
        ``False`` when the input is empty or every update failed
    """
    total = len(list)
    if total == 0:
        LogGo.info("0 Element!")
        return False

    applied = 0
    # Unpacking happens in the loop header, outside the try, so a malformed
    # entry still propagates to the caller.
    for update, where in list:
        try:
            dao.update(update, where)
            applied += 1
        except Exception as e:
            # A failing update is logged and skipped; the rest still run.
            LogGo.warning(repr(e))

    LogGo.info('Total :{} / {} elements Updated!'.format(total, applied))

    return applied != 0