def get_data_from_db(sql, asin=None, data=None):
    """Return brand info for *asin*, preferring the Redis cache over the DB.

    Args:
        sql: SQL statement forwarded to ``UrlQueue.retrieve_asin``.
        asin: ASIN used to build the field name in the ``product_info_asin``
            Redis hash.
        data: optional dict of query parameters; passed through to
            ``retrieve_asin`` when provided.

    Returns:
        The cached brand value if present in Redis, otherwise whatever
        ``retrieve_asin`` returns for *sql*.
    """
    # Fix: the original referenced a ``debug_log`` name that is only defined
    # inside the ``__main__`` block, raising NameError when this function is
    # called from an import context. Build a local Logger, as the sibling
    # add_the_url_to_queue() does. (Leftover debug prints also removed.)
    debug_log = Logger()
    myRedis = GetRedis().return_redis(debug_log)
    # Cache lookup first: hash 'product_info_asin', field 'product_info_<asin>'.
    brand = myRedis.hget('product_info_asin', 'product_info_{}'.format(asin))
    if brand:
        return brand
    # Cache miss: fall back to the database via the URL-queue helper.
    urlQ = UrlQueue(myRedis, debug_log)
    if isinstance(data, dict):  # idiom: isinstance over ``type(x) is dict``
        return urlQ.retrieve_asin(sql, data)
    return urlQ.retrieve_asin(sql)
def add_the_url_to_queue(url_list, url_type):
    """Push every URL in *url_list* onto the queue that matches *url_type*.

    Each URL is wrapped as a ``(url, cid_or_mt, aid)`` tuple before being
    handed to ``add_url_to_queue``.
    """
    print('add_the_url_to_queue.url_type: ', url_type)
    debug_log = Logger()
    myRedis = GetRedis().return_redis(debug_log)
    urlQ = UrlQueue(myRedis, debug_log)
    kwQ = KeyWordQueue(myRedis, debug_log)
    # Keyword URLs go to the keyword queue; everything else to the URL queue.
    theQueue = kwQ if url_type == 'keyword' else urlQ
    aid = 0
    # Map each URL type to its crawl-type code; unknown types fall back to 0.
    cid_or_mt = {'goods': 1, 'reviews': 3, 'tosell': 5}.get(url_type, 0)
    url_tuple_list = []
    for one_url in url_list:
        record = (one_url, cid_or_mt, aid)
        print(record)
        url_tuple_list.append(record)
    if url_tuple_list:
        add_url_to_queue(theQueue, url_tuple_list, url_type=url_type)
# Development entry point: repeatedly drain the keyword queues with a batch of
# crawler threads until no keywords remain, then exit the process.
# NOTE(review): kwQ/urlQ/ipQ/dataQ/cookiesQ/info_log/debug_log are assumed to
# be bound earlier at module level (not visible in this chunk) — confirm,
# otherwise this branch raises NameError at import time.
if BASE_TYPE == 'develop':
    t_num = 1  # number of crawler threads per batch (1 in develop mode)
    while True:
        # Total pending work = normal keyword queue + monitor keyword queue.
        urllen1 = kwQ.return_keyword_len()
        urllen2 = kwQ._get_queue_len('monitorKeyword')
        urllen = urllen1 + urllen2
        if urllen < 1:
            # Nothing left to crawl: terminate the process.
            sys.exit()
        crawlers = [
            KwCrawler(urlQ, ipQ, dataQ, cookiesQ, kwQ, info_log, debug_log)
            for i in range(t_num)
        ]
        for craw in crawlers:
            craw.start()
        for craw in crawlers:
            # Wait for the whole batch before re-checking the queue lengths.
            craw.join()


if __name__ == '__main__':
    # Derive the log name from the script filename (strip path and extension).
    log_name = sys.argv[0].split('/')[-1].split('.')[0]
    debug_log = Logger(log_name=log_name)
    info_log = Logger(log_level='info', log_name=log_name)
    myRedis = GetRedis().return_redis(debug_log)
    # Build every queue the keyword crawler needs on top of one Redis handle.
    kwQ = KeyWordQueue(myRedis, debug_log)
    urlQ = UrlQueue(myRedis, debug_log)
    ipQ = IpQueue(myRedis, debug_log)
    dataQ = DataQueue(myRedis, debug_log)
    cookiesQ = CookQueue(myRedis, debug_log)
    # NOTE(review): the trailing 1 is presumably a thread/worker count —
    # confirm against keyword_start's signature.
    keyword_start(urlQ, ipQ, dataQ, cookiesQ, kwQ, info_log, debug_log, 1)
    # keyword_start()
def url_init():
    """Reset every crawl queue and all bookkeeping state for a new day.

    Called once at service start and by a timer at 00:01 (Pacific time); the
    reset only runs when the current Pacific hour is 0.
    """
    pstNow = return_PST()
    pstHour = pstNow.hour
    debug_log = Logger()
    # info_log = Logger('info')
    if pstHour != 0:
        debug_log.war('init_url: 当前太平洋时间: %s, 不是凌晨0点' % (pstNow.strftime('%Y-%m-%d %H:%M:%S')))
        return
    myRedis = GetRedis().return_redis(debug_log)
    if get_worker_state(myRedis):
        # Another machine already ran today's init; the early return below was
        # deliberately disabled upstream, so this branch is currently a no-op.
        pass
        #return debug_log.war('init_url: %s 其它机器已经初始化过了' % (pstNow.strftime('%Y-%m-%d %H:%M:%S')))
    # Reset the crawl-state flags.
    init_crawler_state()
    kwQ = KeyWordQueue(myRedis, debug_log)
    urlQ = UrlQueue(myRedis, debug_log)
    # ipQ = IpQueue(myRedis, debug_log)
    # dataQ = DataQueue(myRedis, debug_log)
    # cookiesQ = CookQueue(myRedis, debug_log)
    # Stamp the init timestamp (used to decide whether data was already crawled).
    init_updae_tm(urlQ)
    # Drop the new-url set and zero the mission counter.
    empty_new_url_set(urlQ)
    urlQ.set_mission_attempts(0)
    # Clear the per-type retry/failure stats zsets.
    for crawl_type in ('goods', 'tosell', 'reviews', 'keyword'):
        myRedis.zremrangebylex('%s%s' % (crawl_type, 'fail'), '-', '+')
    # Empty the goods / reviews / tosell / keyword work queues.
    for Qname in ('goodsUrlQueue', 'reviewsUrlQueue', 'tosellUrlQueue', 'KeywordQueue'):
        empty_url_queue(myRedis, Qname)
    # Clear the already-added, succeeded, and failed url sets.
    empty_asinAndKw(urlQ)
    empty_succeessUrl(urlQ)
    empty_defeatedUrl(urlQ)
    # Mark the end-of-day statistics report as not yet produced (24h TTL).
    key = 'isStatistics'
    key_type = 'statistics'
    value = pickle.dumps(False)
    urlQ._set_key_value_to_string(key, value, key_type, overtime=86400)
    # Re-seed every url, then mark this worker done so peers can skip init.
    all_url_init(urlQ, kwQ)
    set_worker_state(myRedis)
    # Reset the per-url captcha and failure counters.
    for zkey in ('urlDefeatedTimes', 'urlRobotCheckTimes'):
        myRedis.zremrangebylex(zkey, '-', '+')
    debug_log.war('init_url: %s 初始化完成' % (pstNow.strftime('%Y-%m-%d %H:%M:%S')))
def get_brand_from_db(debug_log):
    """Fetch brand data through the URL queue's DB helper.

    TODO: pass a real SQL statement instead of the empty string.
    """
    redis_conn = GetRedis().return_redis(debug_log)
    url_queue = UrlQueue(redis_conn, debug_log)
    return url_queue.retrieve_asin('')