Beispiel #1
def get_data_from_db(sql, asin=None, data=None):
    """Fetch product data, preferring the Redis cache over the database.

    First checks the Redis hash ``product_info_asin`` for an entry keyed
    ``product_info_<asin>``; on a cache miss, executes *sql* through the
    URL queue's DB helper.

    :param sql:  SQL statement to run on a cache miss.
    :param asin: product ASIN used to build the Redis cache key.
    :param data: optional dict of SQL parameters; passed through only
                 when it is a dict.
    :return: the cached value from Redis, or the DB query result.
    """
    # NOTE(review): relies on a module-level `debug_log` — confirm it is
    # defined at import time in the full file.
    myRedis = GetRedis().return_redis(debug_log)
    # 先从redis的product_info_asin中获取品牌数据 (try the Redis cache first)
    brand = myRedis.hget('product_info_asin', 'product_info_{}'.format(asin))
    if brand:
        return brand
    urlQ = UrlQueue(myRedis, debug_log)
    # isinstance is the idiomatic check and also accepts dict subclasses.
    if isinstance(data, dict):
        return urlQ.retrieve_asin(sql, data)
    return urlQ.retrieve_asin(sql)
Beispiel #2
def add_the_url_to_queue(url_list, url_type):
    """Enqueue a batch of URLs for crawling.

    Each URL is wrapped as a ``(url, cid_or_mt, aid)`` tuple, where
    ``cid_or_mt`` encodes the crawl type (goods=1, reviews=3, tosell=5,
    anything else=0) and ``aid`` is always 0.  The batch goes to the
    keyword queue when ``url_type == 'keyword'``, otherwise to the URL
    queue.

    :param url_list: iterable of URL strings to enqueue.
    :param url_type: crawl type, e.g. 'keyword', 'goods', 'reviews',
                     'tosell'.
    """
    print('add_the_url_to_queue.url_type: ', url_type)
    debug_log = Logger()
    myRedis = GetRedis().return_redis(debug_log)
    urlQ = UrlQueue(myRedis, debug_log)
    kwQ = KeyWordQueue(myRedis, debug_log)
    theQueue = kwQ if url_type == 'keyword' else urlQ
    aid = 0
    # Dict lookup replaces the original if-chain; unknown types map to 0.
    cid_or_mt = {'goods': 1, 'reviews': 3, 'tosell': 5}.get(url_type, 0)
    url_tuple_list = [(url, cid_or_mt, aid) for url in url_list]
    if url_tuple_list:
        add_url_to_queue(theQueue, url_tuple_list, url_type=url_type)
Beispiel #3
    if BASE_TYPE == 'develop':
        t_num = 1
    while True:
        urllen1 = kwQ.return_keyword_len()
        urllen2 = kwQ._get_queue_len('monitorKeyword')
        urllen = urllen1 + urllen2
        if urllen < 1:
            sys.exit()
        crawlers = [
            KwCrawler(urlQ, ipQ, dataQ, cookiesQ, kwQ, info_log, debug_log)
            for i in range(t_num)
        ]
        for craw in crawlers:
            craw.start()
        for craw in crawlers:
            craw.join()


if __name__ == '__main__':
    # Script entry point: build the shared Redis connection, the work
    # queues, and both loggers, then hand everything to keyword_start.
    # Derive the log name from the script filename (strip path and .py).
    log_name = sys.argv[0].split('/')[-1].split('.')[0]
    debug_log = Logger(log_name=log_name)
    info_log = Logger(log_level='info', log_name=log_name)
    myRedis = GetRedis().return_redis(debug_log)
    kwQ = KeyWordQueue(myRedis, debug_log)
    urlQ = UrlQueue(myRedis, debug_log)
    ipQ = IpQueue(myRedis, debug_log)
    dataQ = DataQueue(myRedis, debug_log)
    cookiesQ = CookQueue(myRedis, debug_log)
    # NOTE(review): trailing 1 is presumably the crawler/thread count —
    # confirm against keyword_start's signature.
    keyword_start(urlQ, ipQ, dataQ, cookiesQ, kwQ, info_log, debug_log,
                  1)  # keyword_start()
Beispiel #4
def url_init():
    '''Daily crawl-state initialisation.

    Called once at service start and by a timer at 00:01 every day.
    When the current Pacific-time hour is 0, resets all crawl state in
    Redis: crawler state, init timestamp, fail/retry counters, the four
    work queues and the bookkeeping sets, then re-seeds every URL via
    all_url_init() and marks this worker as done.  At any other hour it
    only logs and returns.
    '''
    pstNow = return_PST()
    pstHour = pstNow.hour
    debug_log = Logger()
    # Guard clause: only run the reset during PST hour 0.
    if pstHour != 0:
        debug_log.war('init_url: 当前太平洋时间: %s, 不是凌晨0点' % (pstNow.strftime('%Y-%m-%d %H:%M:%S')))
        return
    myRedis = GetRedis().return_redis(debug_log)
    # NOTE(review): the early return when another machine already
    # initialised is deliberately disabled (original: `pass` plus a
    # commented-out return); the check's result is ignored on purpose.
    if get_worker_state(myRedis):
        pass
    # Reset crawler state.
    init_crawler_state()
    kwQ = KeyWordQueue(myRedis, debug_log)
    urlQ = UrlQueue(myRedis, debug_log)

    # Set the init timestamp (used to tell whether data was crawled).
    init_updae_tm(urlQ)
    # Clear the new-url set.
    empty_new_url_set(urlQ)
    # Reset the total mission count to 0.
    urlQ.set_mission_attempts(0)

    # Clear the retry-statistics sorted sets (key pattern '<type>fail').
    for crawl_type in ('goods', 'tosell', 'reviews', 'keyword'):
        myRedis.zremrangebylex('%s%s' % (crawl_type, 'fail'), '-', '+')

    # Empty the goods / reviews / tosell / keyword work queues.
    for q_name in ('goodsUrlQueue', 'reviewsUrlQueue',
                   'tosellUrlQueue', 'KeywordQueue'):
        empty_url_queue(myRedis, q_name)

    # Clear the already-added set.
    empty_asinAndKw(urlQ)
    # Clear the succeeded set.
    empty_succeessUrl(urlQ)
    # Clear the download-failure set.
    empty_defeatedUrl(urlQ)

    # Reset the end-of-run report flag to False; expires after one day.
    key = 'isStatistics'
    key_type = 'statistics'
    value = pickle.dumps(False)
    urlQ._set_key_value_to_string(key, value, key_type, overtime=86400)

    # Re-seed every URL.
    all_url_init(urlQ, kwQ)

    # Mark this worker as finished.
    set_worker_state(myRedis)
    # Clear the per-url captcha and failure counters.
    myRedis.zremrangebylex('urlDefeatedTimes', '-', '+')
    myRedis.zremrangebylex('urlRobotCheckTimes', '-', '+')
    debug_log.war('init_url: %s 初始化完成' % (pstNow.strftime('%Y-%m-%d %H:%M:%S')))
Beispiel #5
def get_brand_from_db(debug_log):
    """Look up brand data through the URL queue's DB helper.

    :param debug_log: logger handed to the Redis and queue helpers.
    :return: whatever ``UrlQueue.retrieve_asin`` yields for the
             (currently empty) query string.
    """
    redis_conn = GetRedis().return_redis(debug_log)
    url_queue = UrlQueue(redis_conn, debug_log)
    # TODO: pass in a real SQL statement (currently an empty string).
    return url_queue.retrieve_asin('')