Example #1
def get_redis_time():
    debug_log = Logger()
    myRedis = GetRedis().return_redis(debug_log)
    time_tuple = myRedis.time()  # TIME returns (unix seconds, microseconds)
    print(time_tuple)
    # Zero-pad the microseconds so e.g. (1700000000, 5) is not read as .5 s
    timestr = '%s.%06d' % (time_tuple[0], time_tuple[1])
    print(timestr)
    times = int(float(timestr) * 1000)
    # print(time.strftime('%Y-%m-%d %H:%M:%S', time.localtime(times/1000)))
    return times
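As an aside, the same millisecond value can be computed without going through a string; a minimal sketch using redis-py directly (GetRedis and Logger above are project-specific wrappers, and a local Redis instance is assumed):

import redis

r = redis.Redis()           # assumes a Redis instance on localhost:6379
seconds, micros = r.time()  # TIME returns (unix seconds, microseconds)
millis = seconds * 1000 + micros // 1000
print(millis)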
Example #2
def get_brand_from_api(asin):
    brand = ''
    api_url = GET_ASIN_DATA_API.format(asin)
    data_dict = get_data_from_api(api_url)
    if type(data_dict) is dict:
        data = data_dict.get('data', {})
        # TODO: cache the data in Redis as a hash under the key product_info_asin
        debug_log = Logger()
        myRedis = GetRedis().return_redis(debug_log)
        myRedis.hset('product_info_asin', 'product_info_{}'.format(asin), data)
        print(data)
        brand = data.get('brand', '')
    return brand
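Note that plain redis-py hset() only accepts bytes, str, int, or float values, so passing the dict data as above would raise a DataError unless the GetRedis wrapper serializes it. A minimal sketch of the caching step with JSON serialization (key names mirror the example; the ASIN and payload are hypothetical):

import json
import redis

r = redis.Redis()
data = {'brand': 'ExampleBrand'}  # hypothetical payload
r.hset('product_info_asin', 'product_info_B000TEST', json.dumps(data))
raw = r.hget('product_info_asin', 'product_info_B000TEST')
print(json.loads(raw)['brand'])   # -> ExampleBrand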
Example #3
def get_data_from_db(sql, asin=None, data=None):
    debug_log = Logger()
    myRedis = GetRedis().return_redis(debug_log)
    # First try the cached brand data in the product_info_asin hash in Redis
    print(asin)
    brand = myRedis.hget('product_info_asin', 'product_info_{}'.format(asin))
    print(brand)
    if brand:
        return brand
    print(2222)  # debug marker: cache miss, falling back to the database
    urlQ = UrlQueue(myRedis, debug_log)
    if type(data) is dict:
        result = urlQ.retrieve_asin(sql, data)
    else:
        result = urlQ.retrieve_asin(sql)
    return result
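One caveat: unless the client is created with decode_responses=True, redis-py returns bytes from hget(), so the cached brand above would come back as bytes rather than str. A small sketch, reusing the example's key names with a hypothetical ASIN:

import redis

r = redis.Redis(decode_responses=True)  # return str instead of bytes
brand = r.hget('product_info_asin', 'product_info_B000TEST')
print(brand)  # None when the field is missing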
Example #4
def add_the_url_to_queue(url_list, url_type):
    print('add_the_url_to_queue.url_type: ', url_type)
    debug_log = Logger()
    myRedis = GetRedis().return_redis(debug_log)
    urlQ = UrlQueue(myRedis, debug_log)
    kwQ = KeyWordQueue(myRedis, debug_log)
    theQueue = urlQ
    if url_type == 'keyword':
        theQueue = kwQ
    aid = 0
    cid_or_mt = 0
    if url_type == 'goods':
        cid_or_mt = 1
    if url_type == 'reviews':
        cid_or_mt = 3
    if url_type == 'tosell':
        cid_or_mt = 5
    url_tuple_list = []
    for url in url_list:
        url_tuple = (url, cid_or_mt, aid)
        print(url_tuple)
        url_tuple_list.append(url_tuple)

    if len(url_tuple_list) > 0:
        add_url_to_queue(theQueue, url_tuple_list, url_type=url_type)
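As a design note, the chain of url_type checks above could equally be a dict lookup; a hedged drop-in for that part of the function (values copied from the example, defaulting to 0):

CID_OR_MT = {'goods': 1, 'reviews': 3, 'tosell': 5}
cid_or_mt = CID_OR_MT.get(url_type, 0)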
Example #5
    def wrap(*args, **kwargs):
        R = GetRedis().return_redis(None)
        def get_lock(*args, **kwargs):
            proxy = args[0] if len(args) > 0 and type(args[0]) is dict else kwargs
            print(proxy)
            print(kwargs)
            if 'proxy.crawlera.com' in proxy.get('proxies', {}).get('https', ''):
                # Take a lock from the crawler1 set (free locks)
                the_lock = R.spop('crawler1')
                print(the_lock)
                if the_lock:
                    # After adding it to the crawler2 set (in-use locks), set a usage marker (used to check whether the lock is alive), then return the lock
                    if R.sadd('crawler2', the_lock):
                        if R.set(the_lock, 'used', 60):
                            return the_lock
            elif 'cld-us-dxig.tp-ns.com' in proxy.get('proxies', {}).get('https', ''):
                # Take a lock from the dxig1 set (free locks)
                the_lock = R.spop('dxig1')
                # If there is no lock, or it was already used, return None.
                if not the_lock or R.get(the_lock):
                    return None
                else:
                    # Mark the lock (this proxy allows at most 600 requests per minute, so each lock may be used only once per minute), then return it.
                    if R.set(the_lock, 'used', 60):
                        return the_lock
            else:
                pass

        def rele_lock(*args, **kwargs):
            proxy = args[0] if len(args) > 0 and type(args[0]) is dict else kwargs
            if 'proxy.crawlera.com' in proxy.get('proxies', {}).get('https', ''):
                # When done, remove the lock from crawler2 (in use) and add it back to crawler1 (free locks)
                if R.srem('crawler2', kwargs.get('the_lock')):
                    return R.sadd('crawler1', kwargs.get('the_lock'))
            elif 'cld-us-dxig.tp-ns.com' in proxy.get('proxies', {}).get('https', ''):
                pass
            else:
                pass

        i = 0
        while 1:
            i += 1
            the_lock = get_lock(*args, **kwargs)  # acquire a lock
            print(the_lock)
            if the_lock:
                break
            elif i == 100:
                break
            else:
                print('restart get lock')
                time.sleep(0.1)
        if the_lock:
            result = func(*args, **kwargs)
            # Return the lock to the pool, then return the result
            kwargs['the_lock'] = the_lock
            rele_lock(*args, **kwargs)
            return result
        else:
            raise Exception('get async lock timeout')
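The snippet above is the inner wrapper of a decorator: it closes over func, uses the time module, and implements a two-set lock pool in which SPOP takes a free lock, SADD marks it in use, and a 60-second string key serves as a liveness and rate-limit marker. A minimal standalone sketch of the same pattern with redis-py (set names mirror the example; the seeding step is an assumption):

import redis

r = redis.Redis()
r.sadd('crawler1', 'lock:1', 'lock:2')  # seed the free-lock pool (assumed)

lock = r.spop('crawler1')              # take a lock from the free pool
if lock and r.sadd('crawler2', lock):  # move it to the in-use set
    r.set(lock, 'used', ex=60)         # 60 s marker for liveness checks
    try:
        pass                           # ... the rate-limited work ...
    finally:
        r.srem('crawler2', lock)       # release: drop from the in-use set
        r.sadd('crawler1', lock)       # and return it to the free pool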
Example #6
    if BASE_TYPE == 'develop':
        t_num = 1  # run a single crawler thread in development
    while True:
        # Exit once both the normal and the monitor keyword queues are empty
        urllen1 = kwQ.return_keyword_len()
        urllen2 = kwQ._get_queue_len('monitorKeyword')
        urllen = urllen1 + urllen2
        if urllen < 1:
            sys.exit()
        crawlers = [
            KwCrawler(urlQ, ipQ, dataQ, cookiesQ, kwQ, info_log, debug_log)
            for i in range(t_num)
        ]
        for craw in crawlers:
            craw.start()
        for craw in crawlers:
            craw.join()


if __name__ == '__main__':
    log_name = sys.argv[0].split('/')[-1].split('.')[0]
    debug_log = Logger(log_name=log_name)
    info_log = Logger(log_level='info', log_name=log_name)
    myRedis = GetRedis().return_redis(debug_log)
    kwQ = KeyWordQueue(myRedis, debug_log)
    urlQ = UrlQueue(myRedis, debug_log)
    ipQ = IpQueue(myRedis, debug_log)
    dataQ = DataQueue(myRedis, debug_log)
    cookiesQ = CookQueue(myRedis, debug_log)
    keyword_start(urlQ, ipQ, dataQ, cookiesQ, kwQ, info_log, debug_log, 1)
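For reference, the loop above starts t_num crawler threads per pass and joins them all before re-checking the queue lengths; the same batch pattern with plain threading.Thread (assuming, as the start/join calls suggest, that KwCrawler is a Thread subclass):

import threading

def crawl():
    pass  # stand-in for one crawler's work

threads = [threading.Thread(target=crawl) for _ in range(4)]
for t in threads:
    t.start()
for t in threads:
    t.join()  # block until the whole batch has finished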
Example #7
def url_init():
    '''Called once at service startup; a timer also calls it daily at 00:01.'''
    pstNow = return_PST()
    pstHour = pstNow.hour
    debug_log = Logger()
    # info_log = Logger('info')
    if pstHour == 0:
        myRedis = GetRedis().return_redis(debug_log)
        if get_worker_state(myRedis):
            pass  # NOTE: the early return below is disabled, so this guard is a no-op
            # return debug_log.war('init_url: %s another machine has already initialized' % (pstNow.strftime('%Y-%m-%d %H:%M:%S')))
        # Reset the crawl state
        init_crawler_state()
        kwQ = KeyWordQueue(myRedis, debug_log)
        urlQ = UrlQueue(myRedis, debug_log)
        # ipQ = IpQueue(myRedis, debug_log)
        # dataQ = DataQueue(myRedis, debug_log)
        # cookiesQ = CookQueue(myRedis, debug_log)

        # Set the initialization timestamp (used to tell whether data has already been crawled)
        init_updae_tm(urlQ)

        # Empty the new-URL set
        empty_new_url_set(urlQ)

        # Reset the total task count to 0
        urlQ.set_mission_attempts(0)

        # Empty the retry-statistics queues
        myRedis.zremrangebylex('%s%s' % ('goods', 'fail'), '-', '+')
        myRedis.zremrangebylex('%s%s' % ('tosell', 'fail'), '-', '+')
        myRedis.zremrangebylex('%s%s' % ('reviews', 'fail'), '-', '+')
        myRedis.zremrangebylex('%s%s' % ('keyword', 'fail'), '-', '+')

        # Empty the goods queue
        Qname = 'goodsUrlQueue'
        empty_url_queue(myRedis, Qname)
        # Empty the reviews queue
        Qname = 'reviewsUrlQueue'
        empty_url_queue(myRedis, Qname)
        # Empty the tosell (competing-offers) queue
        Qname = 'tosellUrlQueue'
        empty_url_queue(myRedis, Qname)
        # Empty the keyword queue
        Qname = 'KeywordQueue'
        empty_url_queue(myRedis, Qname)

        # Empty the already-added set
        empty_asinAndKw(urlQ)
        # Empty the succeeded set
        empty_succeessUrl(urlQ)
        # Empty the download-failed set
        empty_defeatedUrl(urlQ)

        # Set the end-of-run report flag to False
        key = 'isStatistics'
        key_type = 'statistics'
        value = pickle.dumps(False)
        urlQ._set_key_value_to_string(key, value, key_type, overtime=86400)

        # Initialize all URLs
        all_url_init(urlQ, kwQ)

        # Mark this worker's initialization as done
        set_worker_state(myRedis)
        # Reset per-URL captcha and failure counters
        myRedis.zremrangebylex('urlDefeatedTimes', '-', '+')
        myRedis.zremrangebylex('urlRobotCheckTimes', '-', '+')
        debug_log.war('init_url: %s initialization finished' % (pstNow.strftime('%Y-%m-%d %H:%M:%S')))
    else:
        debug_log.war('init_url: current Pacific time %s is not 0:00 AM' % (pstNow.strftime('%Y-%m-%d %H:%M:%S')))
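The zremrangebylex calls above use the lexicographic range '-' to '+', which spans every member, so each call simply empties the corresponding sorted set; a standalone sketch:

import redis

r = redis.Redis()
r.zadd('goodsfail', {'url-a': 0, 'url-b': 0})  # hypothetical entries
r.zremrangebylex('goodsfail', '-', '+')         # removes every member
print(r.zcard('goodsfail'))                     # -> 0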
Example #8
def get_brand_from_db(debug_log):
    myRedis = GetRedis().return_redis(debug_log)
    urlQ = UrlQueue(myRedis, debug_log)
    # TODO: pass in a real SQL statement instead of the empty string
    brand = urlQ.retrieve_asin('')
    return brand