def get_redis_time():
    debug_log = Logger()
    myRedis = GetRedis().return_redis(debug_log)
    time_tuple = myRedis.time()
    print(time_tuple)
    # Redis TIME returns (seconds, microseconds); combine them into a float string.
    timestr = '%s.%s' % (time_tuple[0], time_tuple[1])
    print(timestr)
    times = int(float(timestr) * 1000)
    # print(time.strftime('%Y-%m-%d %H:%M:%S', time.localtime(times/1000)))
    return times
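# Usage sketch (assumption, mirroring the commented-out line above):
# get_redis_time() returns the Redis server time in milliseconds, which can be
# turned back into a readable local timestamp like this. The helper name is
# hypothetical, not from the source.
import time

def format_redis_ms(ms):
    # Convert a millisecond timestamp to a 'YYYY-MM-DD HH:MM:SS' local time string.
    return time.strftime('%Y-%m-%d %H:%M:%S', time.localtime(ms / 1000))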
def get_brand_from_api(asin):
    brand = ''
    debug_log = Logger()
    api_url = GET_ASIN_DATA_API.format(asin)
    data_dict = get_data_from_api(api_url)
    if type(data_dict) is dict:
        data = data_dict.get('data', {})
        # TODO: cache the data in Redis under the 'product_info_asin' key, in hash format
        myRedis = GetRedis().return_redis(debug_log)
        # NOTE: Redis hash fields hold strings/bytes, so `data` likely needs to be
        # serialized (e.g. json.dumps(data)) before being stored.
        myRedis.hset('product_info_asin', 'product_info_{}'.format(asin), data)
        print(data)
        brand = data.get('brand', '')
    return brand
def get_data_from_db(sql, asin=None, data=None):
    debug_log = Logger()
    myRedis = GetRedis().return_redis(debug_log)
    # First try the Redis 'product_info_asin' hash for cached brand data.
    print(asin)
    brand = myRedis.hget('product_info_asin', 'product_info_{}'.format(asin))
    print(brand)
    if brand:
        return brand
    # Not cached: fall back to querying the database.
    urlQ = UrlQueue(myRedis, debug_log)
    if type(data) is dict:
        result = urlQ.retrieve_asin(sql, data)
    else:
        result = urlQ.retrieve_asin(sql)
    return result
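# Caching sketch (assumption, not from the source): Redis hash fields only hold
# strings/bytes, so the API payload written in get_brand_from_api() and read in
# get_data_from_db() likely needs to be (de)serialized, e.g. with json. The
# helper names below are hypothetical.
import json

def cache_product_info(myRedis, asin, data):
    # Store the payload as a JSON string in the 'product_info_asin' hash.
    myRedis.hset('product_info_asin', 'product_info_{}'.format(asin), json.dumps(data))

def read_product_info(myRedis, asin):
    # Return the cached payload as a dict, or None if the ASIN is not cached.
    raw = myRedis.hget('product_info_asin', 'product_info_{}'.format(asin))
    return json.loads(raw) if raw else None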
def add_the_url_to_queue(url_list, url_type):
    print('add_the_url_to_queue.url_type: ', url_type)
    debug_log = Logger()
    myRedis = GetRedis().return_redis(debug_log)
    urlQ = UrlQueue(myRedis, debug_log)
    kwQ = KeyWordQueue(myRedis, debug_log)
    theQueue = urlQ
    if url_type == 'keyword':
        theQueue = kwQ
    aid = 0
    cid_or_mt = 0
    if url_type == 'goods':
        cid_or_mt = 1
    elif url_type == 'reviews':
        cid_or_mt = 3
    elif url_type == 'tosell':
        cid_or_mt = 5
    url_tuple_list = []
    for url in url_list:
        url_tuple = (url, cid_or_mt, aid)
        print(url_tuple)
        url_tuple_list.append(url_tuple)
    if len(url_tuple_list) > 0:
        add_url_to_queue(theQueue, url_tuple_list, url_type=url_type)
def wrap(*args, **kwargs):
    # Inner wrapper of a decorator: `func` is supplied by the enclosing decorator.
    R = GetRedis().return_redis(None)

    def get_lock(*args, **kwargs):
        proxy = args[0] if len(args) > 0 and type(args[0]) is dict else kwargs
        print(proxy)
        print(kwargs)
        if 'proxy.crawlera.com' in proxy.get('proxies', {}).get('https', ''):
            # Pop a lock from the 'crawler1' set (available locks).
            the_lock = R.spop('crawler1')
            print(the_lock)
            if the_lock:
                # Add it to the 'crawler2' set (locks in use), mark it as used
                # (used to check whether the lock is still active), then return it.
                if R.sadd('crawler2', the_lock):
                    if R.set(the_lock, 'used', 60):
                        return the_lock
        elif 'cld-us-dxig.tp-ns.com' in proxy.get('proxies', {}).get('https', ''):
            # Pop a lock from the 'dxig1' set (available locks).
            the_lock = R.spop('dxig1')
            # If there was no lock, or it has already been used, return None.
            if not the_lock or R.get(the_lock):
                return None
            else:
                # Mark the lock (this proxy allows at most 600 requests per minute,
                # so each lock may only be used once per minute), then return it.
                if R.set(the_lock, 'used', 60):
                    return the_lock
        else:
            pass

    def rele_lock(*args, **kwargs):
        proxy = args[0] if len(args) > 0 and type(args[0]) is dict else kwargs
        if 'proxy.crawlera.com' in proxy.get('proxies', {}).get('https', ''):
            # When done, remove the lock from 'crawler2' (in use)
            # and put it back into 'crawler1' (available).
            if R.srem('crawler2', kwargs.get('the_lock')):
                return R.sadd('crawler1', kwargs.get('the_lock'))
        elif 'cld-us-dxig.tp-ns.com' in proxy.get('proxies', {}).get('https', ''):
            pass
        else:
            pass

    i = 0
    while 1:
        i += 1
        the_lock = get_lock(*args, **kwargs)  # acquire a lock
        print(the_lock)
        if the_lock:
            break
        elif i == 100:
            break
        else:
            print('restart get lock')
            time.sleep(0.1)
    if the_lock:
        result = func(*args, **kwargs)
        # Give the lock back, then return the result.
        kwargs['the_lock'] = the_lock
        rele_lock(*args, **kwargs)
        return result
    else:
        raise Exception('get async lock timeout')
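# Structural note (assumption, not from the source): `wrap` above references
# `func`, so it is the inner function of a decorator whose outer definition is
# not included in this excerpt. A minimal sketch of that structure, with a
# hypothetical decorator name:
def with_proxy_lock(func):
    def wrap(*args, **kwargs):
        # ...acquire a per-proxy lock, call func, release the lock,
        # as in the body shown above...
        return func(*args, **kwargs)
    return wrap

# Hypothetical usage: the decorated downloader receives a `proxies` mapping,
# and get_lock() inspects proxy['proxies']['https'] to choose the lock pool.
# @with_proxy_lock
# def download_page(proxies=None, **kwargs):
#     ...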
if BASE_TYPE == 'develop':
    t_num = 1
while True:
    urllen1 = kwQ.return_keyword_len()
    urllen2 = kwQ._get_queue_len('monitorKeyword')
    urllen = urllen1 + urllen2
    if urllen < 1:
        sys.exit()
    crawlers = [
        KwCrawler(urlQ, ipQ, dataQ, cookiesQ, kwQ, info_log, debug_log)
        for i in range(t_num)
    ]
    for craw in crawlers:
        craw.start()
    for craw in crawlers:
        craw.join()


if __name__ == '__main__':
    log_name = sys.argv[0].split('/')[-1].split('.')[0]
    debug_log = Logger(log_name=log_name)
    info_log = Logger(log_level='info', log_name=log_name)
    myRedis = GetRedis().return_redis(debug_log)
    kwQ = KeyWordQueue(myRedis, debug_log)
    urlQ = UrlQueue(myRedis, debug_log)
    ipQ = IpQueue(myRedis, debug_log)
    dataQ = DataQueue(myRedis, debug_log)
    cookiesQ = CookQueue(myRedis, debug_log)
    keyword_start(urlQ, ipQ, dataQ, cookiesQ, kwQ, info_log, debug_log, 1)
    # keyword_start()
def url_init():
    '''Called once at service startup; a timer also calls this at 00:01 Pacific time every day.'''
    pstNow = return_PST()
    pstHour = pstNow.hour
    debug_log = Logger()
    # info_log = Logger('info')
    if pstHour == 0:
        myRedis = GetRedis().return_redis(debug_log)
        if get_worker_state(myRedis):
            pass
            # return debug_log.war('init_url: %s another machine has already initialized' % (pstNow.strftime('%Y-%m-%d %H:%M:%S')))
        # Reset the crawl state
        init_crawler_state()
        kwQ = KeyWordQueue(myRedis, debug_log)
        urlQ = UrlQueue(myRedis, debug_log)
        # ipQ = IpQueue(myRedis, debug_log)
        # dataQ = DataQueue(myRedis, debug_log)
        # cookiesQ = CookQueue(myRedis, debug_log)
        # Set the initialization timestamp (used to decide whether data has already been crawled)
        init_updae_tm(urlQ)
        # Clear the new-URL set
        empty_new_url_set(urlQ)
        # Reset the total mission count to 0
        urlQ.set_mission_attempts(0)
        # Clear the retry-statistics queues
        myRedis.zremrangebylex('%s%s' % ('goods', 'fail'), '-', '+')
        myRedis.zremrangebylex('%s%s' % ('tosell', 'fail'), '-', '+')
        myRedis.zremrangebylex('%s%s' % ('reviews', 'fail'), '-', '+')
        myRedis.zremrangebylex('%s%s' % ('keyword', 'fail'), '-', '+')
        # Clear the goods queue
        Qname = 'goodsUrlQueue'
        empty_url_queue(myRedis, Qname)
        # Clear the reviews queue
        Qname = 'reviewsUrlQueue'
        empty_url_queue(myRedis, Qname)
        # Clear the to-sell (other sellers) queue
        Qname = 'tosellUrlQueue'
        empty_url_queue(myRedis, Qname)
        # Clear the keyword queue
        Qname = 'KeywordQueue'
        empty_url_queue(myRedis, Qname)
        # Clear the already-added set
        empty_asinAndKw(urlQ)
        # Clear the succeeded set
        empty_succeessUrl(urlQ)
        # Clear the download-failed set
        empty_defeatedUrl(urlQ)
        # Set the end-of-run report state to False
        key = 'isStatistics'
        key_type = 'statistics'
        value = pickle.dumps(False)
        urlQ._set_key_value_to_string(key, value, key_type, overtime=86400)
        # Initialize all URLs
        all_url_init(urlQ, kwQ)
        # Mark this worker's initialization as done
        set_worker_state(myRedis)
        # Clear the per-URL captcha and failure counters
        myRedis.zremrangebylex('urlDefeatedTimes', '-', '+')
        myRedis.zremrangebylex('urlRobotCheckTimes', '-', '+')
        debug_log.war('init_url: %s initialization complete' % (pstNow.strftime('%Y-%m-%d %H:%M:%S')))
    else:
        debug_log.war('init_url: current Pacific time is %s, not midnight' % (pstNow.strftime('%Y-%m-%d %H:%M:%S')))
def get_brand_from_db(debug_log):
    myRedis = GetRedis().return_redis(debug_log)
    urlQ = UrlQueue(myRedis, debug_log)
    # TODO: pass in a real SQL statement
    brand = urlQ.retrieve_asin('')
    return brand