Example #1
    def __init__(self, queue, queue_name, p_action, p_sleep_time, c_max_num,
                 c_max_sleep_time, c_max_retry_num):
        '''
        Initialize the producer thread.
        :param queue:             Queue object; produced items are put into it
        :param queue_name:        queue name; each business flow has its own
                                  queue, so the name identifies the flow
        :param p_action:          ProducerAction object carrying the business logic
        :param p_sleep_time:      sleep interval between two production runs
        :param c_max_num:         maximum number of consumer threads; how many
                                  consumer threads get created depends on this
        :param c_max_sleep_time:  sleep interval between two consumption runs
                                  of a consumer thread
        :param c_max_retry_num:   maximum number of retries configured for a
                                  ConsumerAction instance whose run failed
        '''
        # 1) Explicitly invoke the parent class's __init__()
        super(self.__class__, self).__init__()

        # 2) Store the parameters
        self.queue = queue
        self.queue_name = queue_name
        self.p_action = p_action
        self.p_sleep_time = p_sleep_time
        self.c_max_num = c_max_num
        self.c_max_sleep_time = c_max_sleep_time
        self.c_max_try_num = c_max_retry_num

        # 3) Validate p_action
        if not isinstance(p_action, ProducerAction):
            raise Exception("%s is not a ProducerAction instance!" % p_action)

        # 4) Initialize the logger
        self.thread_name = '%s_producer' % self.queue_name
        self.logger = LogUtil().get_logger(self.thread_name, self.thread_name)
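
A minimal usage sketch for this constructor. It assumes the surrounding class is a threading.Thread subclass named Producer and that HelloAction is a hypothetical ProducerAction subclass; neither name is confirmed by the source.

from Queue import Queue          # Python 2 stdlib queue

queue = Queue()
action = HelloAction()           # hypothetical ProducerAction subclass
producer = Producer(queue, 'hello', action,
                    p_sleep_time=5, c_max_num=4,
                    c_max_sleep_time=2, c_max_retry_num=3)
producer.start()                 # Thread.start() eventually calls run()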
Example #2
    def __init__(self, queue, q_name, p_action, p_sleep_time, c_max_num,
                 c_max_sleep_time, c_max_retry_num):
        '''
        :param queue:             queue object
        :param q_name:            queue name
        :param p_action:          producer action object
        :param p_sleep_time:      sleep time after each production run
        :param c_max_num:         maximum number of consumer threads
        :param c_max_sleep_time:  maximum sleep time after each consumer run
        :param c_max_retry_num:   maximum number of retries after a failed run
        :return:
        '''

        super(self.__class__, self).__init__()
        self.queue = queue
        self.q_name = q_name
        self.p_action = p_action
        self.p_sleep_time = p_sleep_time
        self.c_max_num = c_max_num
        self.c_max_sleep_time = c_max_sleep_time
        self.c_max_retry_num = c_max_retry_num

        # Check that p_action is an instance of a ProducerAction subclass
        if not isinstance(self.p_action, ProducerAction):
            raise Exception("%s is not ProducerAction instance" %
                            self.p_action)

        # Initialize the logger
        self.logger = LogUtil().get_logger('producer_%s' % self.q_name,
                                           'producer_%s' % self.q_name)
Example #3
    def __init__(self, url, param, queue_id, pro_flag):
        ConsumerAction.__init__(self)
        self.url = url[:-1] if url.endswith('/') else url
        self.param = param
        self.queue_id = queue_id
        self.pro_flag = pro_flag
        # queue_name is presumably a module-level global in the original source
        self.rl = LogUtil().get_logger('consumer', 'consumer' + queue_name)
Example #4
    def __init__(self, kafka_conf):
        host_list = kafka_conf['HOST'].split(',')
        random.shuffle(host_list)
        host_str = ','.join(host_list)
        self.cache_key = '_'.join((host_str, kafka_conf['TOPIC']))
        self.host = host_str
        self.topic = kafka_conf['TOPIC']
        self.rl = LogUtil().get_logger('consumer', 'consumer_kafka')
Example #5
    def __init__(self, text):
        '''
        Initialize this consumer implementation.

        :param text:    the data this consumer should process
        :return:
        '''
        super(self.__class__, self).__init__()
        self.text = text
        self.rl = LogUtil().get_base_logger()
Example #6
def put_inner_to_queue():
    '''
    Move downloaded-page entries cached in Redis (down:* keys) into the
    hainiu_queue table in batches.
    '''
    redis_util = RedisUtill()
    page_show_num = 10
    # Count unprocessed records in hainiu_queue
    select_queue_count_sql = """
    select count(*) from hainiu_queue where type=%s and is_work=0 and fail_times=0;
    """
    # Insert into the hainiu_queue table
    insert_queue_sql = """
    insert into hainiu_queue (type,action,params) values (%s, %s, %s);
    """

    logger = LogUtil().get_logger("download_news_queue", "download_news_queue")
    db_util = DBUtil(_HAINIU_DB)
    db_util.execute_no_commit("set NAMES utf8mb4;")
    try:
        # Count unprocessed records in hainiu_queue
        sql_params = [2]
        res1 = db_util.read_one(select_queue_count_sql, sql_params)
        queue_count = res1[0]
        start_time = time.time()
        if queue_count >= 5:
            logger.info("hainiu_queue 有 %d 条未处理的记录,不需要导入!" % queue_count)
            return None
        inner_count = 0
        for ip in ips:
            key_list = []
            scan_limit_to_queue_table(ip, port, 0, 'down:*', 20, key_list)

            inner_count = inner_count + len(key_list)
            # Fetch the value list from Redis for the collected keys
            values = redis_util.get_values_batch_keys(key_list)
            # Import into the hainiu_queue table
            insert_queue_record = []
            for value in values:
                queue_param = json.loads(value)
                a_url = queue_param['a_url']
                insert_queue_record.append((2, a_url, value))

            db_util.executemany_no_commit(insert_queue_sql,
                                          insert_queue_record)
            db_util.commit()
            # Remove the imported keys from Redis
            redis_util.delete_batch(key_list)

        end_time = time.time()
        run_time = end_time - start_time
        logger.info("本地导入 %d 条数据, 用时 %.2f 秒" % (inner_count, run_time))

    except Exception:
        traceback.print_exc()
        db_util.rollback()
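
Example #6 calls a scan_limit_to_queue_table helper that is not shown here, and it also relies on module-level ips and port values. A sketch consistent with the call site above and with the inline variant in Example #21, assuming the redis-py client:

import redis

def scan_limit_to_queue_table(host, port, cursor, match, count, key_list):
    # Walk the SCAN cursor and collect every key matching `match` into key_list.
    r = redis.Redis(host, port)
    cursor, keys = r.scan(cursor, match, count)
    key_list.extend(keys)
    if cursor == 0:
        # SCAN signals completion by returning cursor 0
        return
    scan_limit_to_queue_table(host, port, cursor, match, count, key_list)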
Example #7
    def __init__(self, limit, fail_times):
        '''
        Initialize the queue producer.

        :param limit:        how many records to take from the queue per run
        :param fail_times:   fail-count threshold used when selecting records
        '''
        super(self.__class__, self).__init__()
        self.limit = limit
        self.fail_times = fail_times
        self.rl = LogUtil().get_logger('producer', 'producer' + queue_name)
Example #8
    def __init__(self, id, ac, params):
        '''
        Initialize the queue consumer.

        :param id:       message ID, i.e. the ID column of the database table
        :param ac:       message action, i.e. the action column of the table
        :param params:   extra parameters for the message action
        '''
        super(self.__class__, self).__init__()
        self.id = id
        self.ac = ac
        self.params = params
        self.rl = LogUtil().get_logger('consumer', 'consumer' + queue_name)
Example #9
def push_queue_items():
    count_news_seed_queue_sql = """select count(*) from hainiu_queue where type=3 and fail_times=0;"""
    insert_news_seed_internally_queue_items_sql = """
        insert into hainiu_queue (type,action,params) values(3,%s,%s);
    """
    count_news_seed_internally_sql = """select count(*) from hainiu_web_seed_internally where status=0 for update;"""
    select_news_seed_internally_sql = """select a_url,param,id from hainiu_web_seed_internally where status=0 limit %s,%s;"""
    update_news_seed_internally_sql = """update hainiu_web_seed_internally set status=1 where id in (%s);"""
    rl = LogUtil().get_base_logger()
    try:
        d = DBUtil(config._HAINIU_DB)
        queue_total = d.read_one(count_news_seed_queue_sql)[0]
        if queue_total != 0:
            rl.info('previous download_page queue batch not finished; %s items still pending' % (queue_total))
            return


        starttime = time.clock()
        d = DBUtil(config._HAINIU_DB)
        total = long(d.read_one(count_news_seed_internally_sql)[0])
        page_size = 1000
        page = total / page_size
        for i in range(0, page + 1):
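            # The offset stays 0 on purpose: each batch is flagged status=1
            # below before the next query, so only unprocessed rows remain.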
            sql = select_news_seed_internally_sql % (0, page_size)
            rows = d.read_tuple(sql)
            values = []
            id_values = []
            for l in rows:
                url = l[0]
                url = url if url is not None else ''
                param = l[1]
                param = param if param is not None else ''
                values.append((url,param))
                id = l[2]
                id_values.append(str(id))
            if id_values:
                random.shuffle(values)
                d.executemany_no_commit(insert_news_seed_internally_queue_items_sql,values)
                ids = ','.join(id_values)
                sql = update_news_seed_internally_sql % (ids)
                d.execute(sql)
        endtime = time.clock()
        worksec = int(round((endtime - starttime)))
        rl.info('push seed_internally queue finished; total items %s, elapsed %ss' % (total, worksec))
    except:
        rl.exception()
        rl.error(sql)
        d.rollback()
    finally:
        d.close()
Example #10
def push_queue_items():
    count_news_seed_sql = """select count(*) from hainiu_web_seed where status=0;"""
    select_news_seed_sql = """select url,category,last_crawl_time from hainiu_web_seed where status=0 limit %s,%s;"""
    insert_news_seed_queue_items_sql = """insert into hainiu_queue (type,action,params) values(1,%s,%s);"""
    count_news_seed_queue_sql = """select count(*) from hainiu_queue where type=1 and fail_times=0;"""
    rl = LogUtil().get_base_logger()
    try:
        d = DBUtil(config._HAINIU_DB)
        queue_total = d.read_one(count_news_seed_queue_sql)[0]
        if queue_total != 0:
            rl.info('previous news_find queue batch not finished; %s items still pending' %
                    (queue_total))
            return

        starttime = time.clock()
        total = long(d.read_one(count_news_seed_sql)[0])
        page_size = 1000
        page = total / page_size
        for i in range(0, page + 1):
            sql = select_news_seed_sql % (i * page_size, page_size)
        rows = d.read_tuple(sql)
        values = []
        for l in rows:
                url = l[0]
            publisher = get_tld(url)
            publisher = publisher[0:publisher.index('.')] if '.' in publisher else publisher
                param = {}
                param['category'] = l[1]
                param['publisher'] = publisher
                param = json.dumps(param, ensure_ascii=False)
                values.append((url, param))

            if values:
                random.shuffle(values)
                d.executemany(insert_news_seed_queue_items_sql, values)
        endtime = time.clock()
        worksec = int(round((endtime - starttime)))
        rl.info(
            'push news_find queue finished; total items %s, elapsed %ss' %
            (total, worksec))
    except:
        rl.exception()
        rl.error(sql)
        d.rollback()
    finally:
        d.close()
Example #11
    def __init__(self, queue, name, sleep_time, work_try_num):
        '''
        Initialize the consumer thread.

        :param queue:           the queue to consume from
        :param name:            consumer thread name, used to identify the consumer
        :param sleep_time:      time to sleep before the next consumption run
        :param work_try_num:    number of failures allowed per consumption action
        '''
        super(self.__class__, self).__init__()
        self.queue = queue
        self.name = name
        self.sleep_time = sleep_time
        self.work_try_num = work_try_num
        Consumer._WORK_TRY_NUM = work_try_num
        self.rl = LogUtil().get_logger(
            'consumer', 'consumer' + self.name[:self.name.find("_")])
Example #12
    def send_sms(self, content, phone=config._ALERT_PHONE):
        """Send an alert SMS with the given content to the given phone number.
        """
        l = LogUtil().get_base_logger()
        try:
            send_url = 'http://send.sms.hainiu.com:8080/s?command=cralwer&phone=%s&' % (
                phone)
            send_url += urllib.urlencode(
                {'content': content.decode('utf-8').encode('gbk')})
            r = urllib2.urlopen(send_url).read()
            if '0-OK' != r:
                l.error("SMS send failed; server returned status: %s, phone: %s, content: %s" %
                        (r, phone, content))
                return False
        except:
            l.exception()
            return False
        return True
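
A hedged usage sketch; the class that owns send_sms is not shown above, so AlertUtil is a hypothetical stand-in name:

alert = AlertUtil()              # hypothetical owner class
if not alert.send_sms('hainiu crawler: queue backlog detected'):
    # send_sms returns False on a non-'0-OK' response or on an exception
    print 'alert delivery failed'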
Example #13
    def __init__(self, queue, thread_name, max_sleep_time, max_retry_num):
        '''
        :param queue:           queue object
        :param thread_name:     consumer thread name
        :param max_sleep_time:  sleep time after each consumption run
        :param max_retry_num:   maximum number of retries after each failure
        :return:
        '''
        # Call the parent initializer so that the run() method can work
        super(self.__class__, self).__init__()

        self.queue = queue
        self.thread_name = thread_name
        self.max_sleep_time = max_sleep_time
        self.max_retry_num = max_retry_num

        # Initialize the logger
        self.logger = LogUtil().get_logger(self.thread_name, self.thread_name)
Example #14
    def __init__(self, queue, thread_name, max_sleep_time, max_retry_num):
        '''
        Initialize the consumer thread.
        :param queue:          Queue object to take items to consume from
        :param thread_name:    thread name, used when logging from the thread
        :param max_sleep_time: sleep interval between two consumption runs
        :param max_retry_num:  maximum number of retries configured for a
                               ConsumerAction instance whose run failed
        '''

        # 1) Explicitly invoke the parent class's __init__()
        super(self.__class__, self).__init__()

        # 2) Store the parameters
        self.queue = queue
        self.thread_name = thread_name
        self.max_sleep_time = max_sleep_time
        self.max_retry_num = max_retry_num

        # 3) Initialize the logger
        self.logger = LogUtil().get_logger(thread_name, thread_name)
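
A minimal usage sketch for this constructor, mirroring the producer sketch after Example #1. It assumes the surrounding class is a threading.Thread subclass named Consumer; the source does not confirm that name.

from Queue import Queue

queue = Queue()                  # shared with the producer side
consumer = Consumer(queue, 'hello_consumer_1',
                    max_sleep_time=2, max_retry_num=3)
consumer.start()                 # Thread.start() eventually calls run()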
Example #15
def push_queue_items():
    insert_sql = """
    insert into hainiu_queue (type,params,action) values(1,%s,%s);
    """
    count_sql = """
    select count(1) from hainiu_queue where type=1;
    """
    select_sql = """
    select id from hainiu_queue where type=1 limit %s,%s;
    """
    rl = LogUtil().get_base_logger()
    try:
        d = DBUtil(config._HAINIU_DB)
        sql = insert_sql
        insert_list = [("aaa", "bbb"), ("dffddf", "awwee")]
        d.executemany(sql, insert_list)

        sql = count_sql
        queue_total = d.read_one(sql)[0]
        print "queue_total", queue_total
        page_size = 10
        page = (queue_total / page_size) + 1
        print "page", page

        for i in range(0, page):
            sql = select_sql % (i * page_size, page_size)
            select_list = d.read_tuple(sql)
            print "page", i
            for record in select_list:
                id = record[0]
                print id

    except:
        rl.exception()
        rl.error(sql)
        d.rollback()
    finally:
        d.close()
Example #16
    def __init__(self, queue, action, name, max_num, sleep_time, work_sleep_time, work_try_num):
        '''
        Initialize the producer thread.

        :param queue:           the queue to produce into
        :param action:          producer action
        :param name:            producer name
        :param max_num:         number of consumers to start
        :param sleep_time:      time to sleep before the next production run
        :param work_sleep_time: sleep time of each consumer
        :param work_try_num:    number of failures allowed per consumption action
        '''
        super(self.__class__, self).__init__()
        self.queue = queue
        self.action = action
        self.name = name
        self.max_num = max_num
        self.sleep_time = sleep_time
        self.work_sleep_time = work_sleep_time
        self.work_try_num = work_try_num
        self.rl = LogUtil().get_logger('producer', 'producer' + self.name)
        if not isinstance(self.action, base_producer_action.ProducerAction):
            raise Exception('action is not a ProducerAction instance')
Example #17
def create_seed():
    sql = """
    insert into web_seed (url,md5,domain,host,category,status) values
    ('%s','%s','%s','%s','%s',0);
    """
    url = "https://news.sina.com.cn/"
    catetory = "新闻"
    hu = HtmlUtil()
    domain = get_tld(url)
    host = hu.get_url_host(url)
    u = Util()
    md5 = u.get_md5(url)

    rl = LogUtil().get_base_logger()
    try:
        d = DBUtil(config._ZZ_DB)
        sql = sql % (url, md5, domain, host, category)
        d.execute(sql)
    except:
        rl.exception()
        d.rollback()
    finally:
        d.close()
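
The INSERT above is built by string interpolation, which breaks (and is injectable) if url or category ever contains a quote. A safer sketch, assuming DBUtil.executemany accepts a %s-placeholder statement plus parameter tuples, the way the other examples use it:

    insert_seed_sql = """
    insert into web_seed (url,md5,domain,host,category,status)
    values (%s,%s,%s,%s,%s,0);
    """
    d.executemany(insert_seed_sql, [(url, md5, domain, host, category)])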
Example #18
def create_seed():
    url = "https://www.autohome.com.cn/all"
    catetory = "汽车"
    sql = """
    insert into hainiu_web_seed (url,md5,domain,host,category,status) values
    ('%s','%s','%s','%s','%s',0);
    """
    hu = HtmlUtil()
    domain = get_tld(url)
    host = hu.get_url_host(url)
    u = Util()
    md5 = u.get_md5(url)

    rl = LogUtil().get_base_logger()
    try:
        d = DBUtil(config._HAINIU_DB)
        sql = sql % (url, md5, domain, host, category)
        d.execute(sql)
    except:
        rl.exception()
        d.rollback()
    finally:
        d.close()
Example #19
    def action(self):
        logger = LogUtil().get_logger("download_action", "download_action")
        # 1) Download the HTML of the queued URL into a file; each consumer
        #    thread rolls over to a new file every 5 minutes.
        r = RequestUtil()
        u = Util()
        db_util = DBUtil(_HAINIU_DB)
        time_util = TimeUtil()
        # Fetch the URL through PhantomJS so AJAX-rendered content is included
        html = r.http_get_phandomjs(self.act)
        # Build the record to write
        html = html.replace("\r", "").replace("\n", "\002")
        str1 = self.act + "\001" + html
        str2 = u.get_md5(str1) + "\001" + str1
        # success/failure flag
        is_success = True
        # Current time split into [day, hour, minute, second]
        now_time = time.strftime("%Y%m%d,%H,%M,%S").split(",")
        day = now_time[0]
        hour = now_time[1]
        minute = int(now_time[2])
        # Round the minute down to the nearest multiple of 5
        for i in range(60, -5, -5):
            if minute < i:
                continue
            minute = i
            break

        minute = '0%s' % minute if minute < 10 else minute
        now_minute = '%s%s%s' % (day, hour, minute)

        file_names = os.listdir(_LOCAL_DATA_DIR % ('tmp'))
        logger.info("file_names:%s" % file_names)
        thread_name = self.consumer_thread_name
        logger.info("thread_name:%s" % thread_name)
        last_file_name = ''
        for file_name in file_names:
            tmp = file_name.split("#")[0]
            if tmp == thread_name:
                last_file_name = file_name
                break

        now_file_name = "%s#%s" % (thread_name, now_minute)
        try:
            if last_file_name == '' or last_file_name != now_file_name:
                # Roll the previous file over to the done directory
                oldPath = _LOCAL_DATA_DIR % ("tmp/") + last_file_name
                logger.info("oldPath:%s" % oldPath)
                if last_file_name != '':
                    done_file_name = last_file_name + "#" + str(
                        TimeUtil().get_timestamp())
                    logger.info("last_file_name:%s" % last_file_name)
                    newPath = _LOCAL_DATA_DIR % ("done/") + done_file_name
                    logger.info("newPath:%s" % newPath)
                    shutil.move(oldPath, newPath)
                # Write to a new file
                now_file_name = _LOCAL_DATA_DIR % ("tmp/") + now_file_name
                logger.info("now_file_name:%s" % now_file_name)
                f = open(now_file_name, 'a+')
                f.write(str2)
                f.close()
            else:
                last_file_name = _LOCAL_DATA_DIR % ("tmp/") + last_file_name
                logger.info("last_file_name:%s" % last_file_name)
                # Appending to the existing file, so prepend a newline
                insert_str = "\n" + str2
                f = open(last_file_name, 'a+')
                f.write(insert_str)
                f.close()
        except Exception:
            is_success = False
            traceback.print_exc()
Example #20
def put_inner_to_queue():
    '''
    Page through hainiu_web_seed_internally and move pending rows (status=0)
    into the hainiu_queue table in batches.
    '''
    page_show_num = 10
    # Count unprocessed records in hainiu_queue
    select_queue_count_sql = """
    select count(*) from hainiu_queue where type=%s and is_work=0 and fail_times=0;
    """
    # Count qualifying records in the internal-links table
    select_inner_count_sql = """
    select count(*) from hainiu_web_seed_internally where status=0;
    """
    # Page through the internal-links table
    select_inner_limit_sql = """
    select md5,a_url,a_md5,domain,a_host,a_title from hainiu_web_seed_internally WHERE
    status=0 limit 0,%s;
    """
    # Insert into the hainiu_queue table
    insert_queue_sql = """
    insert into hainiu_queue (type,action,params) values (%s, %s, %s);
    """
    # Update the status column of the internal-links table
    update_inner_status_sql = """
    update hainiu_web_seed_internally set status=1 where a_md5=%s and md5=%s
    """
    logger = LogUtil().get_logger("download_news_queue", "download_news_queue")
    db_util = DBUtil(_HAINIU_DB)
    try:
        # Count unprocessed records in hainiu_queue
        sql_params = [2]
        res1 = db_util.read_one(select_queue_count_sql, sql_params)
        queue_count = res1[0]
        if queue_count >= 5:
            logger.info("hainiu_queue 有 %d 条未处理的记录,不需要导入!" % queue_count)
            return None
        # Count qualifying records in the internal-links table
        res2 = db_util.read_one(select_inner_count_sql)
        inner_count = res2[0]

        # Number of pages (ceiling division)
        page_num = inner_count / page_show_num if inner_count % page_show_num == 0 \
            else inner_count / page_show_num + 1
        start_time = time.time()
        # Query page by page
        for page in range(page_num):
            sql_params = [page_show_num]
            res3 = db_util.read_dict(select_inner_limit_sql, sql_params)
            # Records to insert into the queue table
            insert_queue_record = []
            # Rows to update in the internal-links table
            update_inner_status_record = []
            for row in res3:
                # Build a fresh param dict per row (the original reused a
                # single dict; safe only because every key is overwritten)
                param_dict = {}
                # md5,a_url,a_md5,domain,a_host,a_title
                md5 = row['md5']
                a_url = row['a_url']
                a_md5 = row['a_md5']
                domain = row['domain']
                a_host = row['a_host']
                a_title = row['a_title']
                # param payload
                param_dict['md5'] = md5
                param_dict['a_md5'] = a_md5
                param_dict['domain'] = domain
                param_dict['a_host'] = a_host
                param_dict['a_title'] = a_title

                param_json = json.dumps(param_dict,
                                        ensure_ascii=False,
                                        encoding='utf-8')
                # Collect the records
                insert_queue_record.append((2, a_url, param_json))
                update_inner_status_record.append((a_md5, md5))

            db_util.executemany(insert_queue_sql, insert_queue_record)
            db_util.executemany(update_inner_status_sql,
                                update_inner_status_record)
        end_time = time.time()
        run_time = end_time - start_time
        logger.info("本地导入 %d 条数据, 用时 %.2f 秒" % (inner_count, run_time))

    except Exception:
        traceback.print_exc()
        db_util.rollback()
Example #21
def put_queue_inner():

    # Count qualifying records in web_queue
    count_queue_sql = '''
    select count(*) from web_queue where is_work=%s and fail_times < %s;
    '''

    # Insert records into the web_queue table
    insert_queue_sql = '''
    insert into web_queue (type,action,params) values(%s,%s,%s);
    '''

    # Flag processed rows in web_seed_internally
    update_sql = '''
    update web_seed_internally set status=1 where md5=%s and a_md5=%s;
    '''
    try:
        # Read the temporary data from Redis
        redis_d = RedisUtill()
        db_util = DBUtil(_ZZ_DB)

        ips = ['192.168.235.136', '192.168.235.137', '192.168.235.138']
        port = '6379'
        key_list = []
        total_num = 0
        is_get_lock = redis_d.get_lock('seed_lock', 10)
        logger = LogUtil().get_base_logger()

        sql_params = [0, _QUEUE_ZZ['MAX_FAIL_TIMES']]
        res1 = db_util.read_one(count_queue_sql, sql_params)
        total_num1 = res1[0]
        if total_num1 != 0:
            logger.info("queue has %d records,not insert!" % total_num1)
            return None

        logger.info("正在获取锁...")
        if is_get_lock:
            logger.info("获取到锁")
            start_time = time.time()

            def scan_limit_to_queue_table(host, port, cursor, match, count):
                r = redis.Redis(host, port)
                rs = r.scan(cursor, match, count)
                # new cursor returned by SCAN
                next_num = rs[0]
                # print rs
                li = rs[1]
                for i in li:
                    if 'a_url' in i:
                        key_list.append(i)

                # recursion exit: SCAN returned cursor 0
                if next_num == 0:
                    return None
                scan_limit_to_queue_table(host, port, next_num, match, count)
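                # NOTE: a very large keyspace could exhaust Python's recursion
                # limit here; an iterative loop over the cursor would be safer.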

            for ip in ips:
                scan_limit_to_queue_table(ip, port, 0, 'seed_temp*', 100)

            # Insert into the queue table page by page
            redis_result = []
            up_inner = []
            delete_list = []
            for k in key_list:
                if 'a_url' in k:

                    # Derive the sibling keys that share this MD5
                    param = k.replace('a_url', 'param')
                    md5 = k.replace('a_url', 'md5')
                    a_md5 = k.replace('a_url', 'a_md5')

                    action = redis_d.get_value_for_key(k)
                    params = redis_d.get_value_for_key(param)
                    redis_result.append((2, action, params))

                    md5_val = redis_d.get_value_for_key(md5)
                    a_md5_val = redis_d.get_value_for_key(a_md5)
                    up_inner.append((md5_val, a_md5_val))
                    # Queue the keys for deletion
                    delete_list.append(k)
                    delete_list.append(param)
                    delete_list.append(md5)
                    delete_list.append(a_md5)
                    total_num += 1

                # Insert into the queue in batches of 5
                if len(redis_result) == 5:
                    db_util.executemany(insert_queue_sql, redis_result)
                    db_util.executemany(update_sql, up_inner)
                    redis_result = []
                    up_inner = []
            # Flush the final, possibly short, batch
            db_util.executemany(insert_queue_sql, redis_result)
            db_util.executemany(update_sql, up_inner)
            # Delete the temporary keys from Redis
            redis_d.delete_batch(delete_list)

            redis_d.release('seed_lock')
            logger.info("释放锁")
        else:
            logger.info('another worker holds the lock; wait exceeded the maximum timeout, aborting')

        end_time = time.time()
        run_time = end_time - start_time
        logger.info("total_num:%d, run_time:%.2f" % (total_num, run_time))

    except Exception:
        db_util.rollback()
        redis_d.release('seed_lock')
        traceback.print_exc()
Example #22
def put_seed_to_queue(page_show_num):
    '''
    Page through the seed table and bulk-import rows into hainiu_queue.
    :param page_show_num: number of rows per query
    '''
    # Count unprocessed records in hainiu_queue
    select_queue_count_sql = """
    select count(*) from hainiu_queue where type=%s and is_work=0 and fail_times=0;
    """

    # Count qualifying records in the seed table
    select_seed_count_sql = """
    select count(*) from hainiu_web_seed where status=0;
    """

    # SQL for paging through the seed table
    select_seed_limit_sql = """
    select url, md5, domain, host, category from hainiu_web_seed
    where status=0 limit %s,%s;
     """

    # insert hainiu_queue sql
    insert_queue_sql = """
    insert into hainiu_queue (type,action,params) values (%s, %s, %s);
    """
    logger = LogUtil().get_logger("news_find_queue", "news_find_queue")
    db_util = DBUtil(_HAINIU_DB)
    try:
        # 1) Count unprocessed records in hainiu_queue
        sql_params = [1]
        # res1 is a one-row tuple
        res1 = db_util.read_one(select_queue_count_sql, sql_params)
        queue_count = res1[0]
        if queue_count >= 5:
            logger.info("hainiu_queue 有 %d 条未处理的记录,不需要导入!" % queue_count)
            return None

        start_time = time.time()

        # 2) Count qualifying records in the seed table
        res2 = db_util.read_one(select_seed_count_sql)
        seed_count = res2[0]

        # Number of pages (ceiling division)
        page_num = seed_count / page_show_num if seed_count % page_show_num == 0 \
            else seed_count / page_show_num + 1

        # Query page by page
        for i in range(page_num):
            sql_params = [i * page_show_num, page_show_num]
            # res3 is a tuple of dicts, one per row
            res3 = db_util.read_dict(select_seed_limit_sql, sql_params)
            # Values to insert into the queue table
            insert_queue_values = []

            for row in res3:
                # Build a fresh params dict per row (the original reused a
                # single dict; safe only because every key is overwritten)
                params_dict = {}
                # url, md5, domain, host, category
                act = row['url']
                md5 = row['md5']
                domain = row['domain']
                host = row['host']
                category = row['category']
                params_dict['md5'] = md5
                params_dict['domain'] = domain
                params_dict['host'] = host
                params_dict['category'] = category

                params_json = json.dumps(params_dict,
                                         ensure_ascii=False,
                                         encoding='utf-8')

                insert_queue_values.append((1, act, params_json))
            # Batch-insert the page into the queue table
            db_util.executemany(insert_queue_sql, insert_queue_values)

        end_time = time.time()
        run_time = end_time - start_time
        logger.info("本地导入 %d 条数据, 用时 %.2f 秒" % (seed_count, run_time))

    except Exception, e:
        logger.exception(e)
Example #23
#-*- encoding: utf-8 -*-
'''
log_demo.py
Created on 21-1-30 at 11:23 AM
Copyright (c) 21-1-30, 海牛学院 (Hainiu Academy). All rights reserved.
@author: 潘牛
'''
from commons.util.log_util import LogUtil

logger1 = LogUtil().get_logger("log_name", "log_file")

logger2 = LogUtil().get_logger("log_name", "log_file")

# Both names refer to the same object: LogUtil caches loggers by name
print logger1 is logger2
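# expected output: True (the same cached logger instance is returned for the same name)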

logger1.info("测试 info 级别")
logger1.error("测试 error 级别")

try:
    1 / 0
except Exception, e:
    logger1.exception(e)
Example #24
def xpath_config_file():
    select_xpath_rule_sql = """select host,xpath,type from stream_extract_xpath_rule where host='%s' and status=0"""
    rl = LogUtil().get_base_logger()
    try:
        # _HAINIU_DB = {'HOST': '192.168.137.190', 'USER': '******', 'PASSWD': '12345678', 'DB': 'hainiucrawler',
        #             'CHARSET': 'utf8', 'PORT': 3306}
        d = DBUtil(config._HAINIU_DB)
        # d = DBUtil(_HAINIU_DB)
        r = redis.Redis('nn1.hadoop', 6379, db=6)
        # r = redis.Redis('redis.hadoop', 6379, db=6)
        f = FileUtil()
        t = TimeUtil()
        c = Client("http://nn1.hadoop:50070")

        time_str = t.now_time(format='%Y%m%d%H%M%S')
        # local_xpath_file_path = '/Users/leohe/Data/input/xpath_cache_file/xpath_file' + time_str
        local_xpath_file_path = '/home/qingniu/xpath_cache_file/xpath_file' + time_str

        start_cursor = 0
        is_finish = True
        starttime = time.clock()
        host_set = set()

        while is_finish:
            values = set()
            limit = r.scan(start_cursor, 'total:*', 10)
            if limit[0] == 0:
                is_finish = False
            start_cursor = limit[0]
            for h in limit[1]:
                host = h.split(":")[1]
                total_key = h
                txpath_key = 'txpath:%s' % host
                fxpath_key = 'fxpath:%s' % host
                total = r.get(total_key)

                # Top two xpaths by score, in descending order
                txpath = r.zrevrange(txpath_key, 0, 1)
                row_format = "%s\t%s\t%s\t%s"
                txpath_num_1 = None  # guard: only set when two xpaths exist
                if txpath:
                    # print 'txpath:%s' % txpath
                    txpath_num = int(r.zscore(txpath_key, txpath[0]))
                    if txpath.__len__() == 2:
                        txpath_num_1 = int(r.zscore(txpath_key, txpath[1]))
                        txpath_num_1 = txpath_num_1 if txpath_num_1 is not None else 0

                    # print 'txpath_max_num:%s' % txpath_num
                    if txpath_num / float(total) >= 0.8:
                        values.add(row_format % (host, txpath[0], 'true', '0'))
                        host_set.add(host)
                    else:
                        if txpath_num >= 1:
                            values.add(row_format %
                                       (host, txpath[0], 'true', '0'))
                            host_set.add(host)
                        if txpath_num_1 is not None and txpath_num_1 >= 1:
                            values.add(row_format %
                                       (host, txpath[1], 'true', '0'))
                            host_set.add(host)

                fxpath = r.smembers(fxpath_key)
                if fxpath:
                    # print 'fxpath:%s' % fxpath
                    for fx in fxpath:
                        values.add(row_format % (host, fx, 'false', '0'))
                    host_set.add(host)

                sql = select_xpath_rule_sql % host
                list_rule = d.read_tuple(sql)
                for rule in list_rule:
                    type = rule[2]
                    if type == 0:
                        values.add(row_format %
                                   (rule[0], rule[1], 'true', '2'))
                        host_set.add(host)
                    elif type == 1:
                        values.add(row_format %
                                   (rule[0], rule[1], 'false', '3'))
                        host_set.add(host)

            f.write_file_line_pattern(local_xpath_file_path, values, "a")
        # Upload to the XPath config-file directory on HDFS
        c.upload("/user/qingniu/xpath_cache_file/", local_xpath_file_path)
        endtime = time.clock()
        worksec = int(round((endtime - starttime)))
        rl.info('total hosts %s, elapsed %ss' %
                (len(host_set), worksec))
    except:
        rl.exception()
        d.rollback()
    finally:
        d.close()
Example #25
def redis2Hdfs():

    select_xpath_rule_sql = """select host,xpath,type from stream_extract_xpath_rule where host='%s' and status=0"""
    rl = LogUtil().get_base_logger()
    try:
        d = DBUtil(_ZZ_DB)

        start = 0
        is_finish = True
        host_set = set()

        f = FileUtil()
        t = TimeUtil()
        time_str = t.now_time(format='%Y%m%d%H%M%S')
        #local_xpath_file_path = '/user/zengqingyong17/spark/xpath_cache_file' + time_str
        local_xpath_file_path = 'E:/python_workspaces/data/xpath/xpath_file' + time_str

        starttime = time.clock()
        r = redis.Redis('nn1.hadoop', '6379', db=6)
        while is_finish:
            values = set()
            rs = r.scan(start, "total_z:*", 10)
            # new cursor returned by SCAN
            start = rs[0]
            if start == 0:
                is_finish = False
            # print rs
            for i in rs[1]:
                host = i.split(":")[1]
                total_key = i
                txpath_key = 'txpath_z:%s' % host
                fxpath_key = 'fxpath_z:%s' % host
                total = r.get(total_key)

                # Top two xpaths by score, in descending order (indices 0,1)
                txpath = r.zrevrange(txpath_key, 0, 1)
                row_format = "%s\t%s\t%s\t%s"

                txpath_num_1 = None  # guard: only set when two xpaths exist
                if txpath:
                    txpath_num = int(r.zscore(txpath_key, txpath[0]))
                    if txpath.__len__() == 2:
                        # score of the second-ranked xpath
                        txpath_num_1 = int(r.zscore(txpath_key, txpath[1]))
                        txpath_num_1 = txpath_num_1 if txpath_num_1 is not None else 0
                    if txpath_num / float(total) >= 0.8:
                        values.add(row_format % (host, txpath[0], 'true', '0'))
                        host_set.add(host)
                    else:
                        if txpath_num >= 100:
                            values.add(row_format % (host, txpath[0], 'true', '0'))
                            host_set.add(host)
                        if txpath_num_1 is not None and txpath_num_1 >= 100:
                            values.add(row_format % (host, txpath[1], 'true', '0'))
                            host_set.add(host)

                # Fetch all members of fxpath_key
                fxpath = r.smembers(fxpath_key)
                if fxpath:
                    # print 'fxpath:%s' % fxpath
                    for fx in fxpath:
                        values.add(row_format % (host, fx, 'false', '1'))
                    host_set.add(host)

                sql = select_xpath_rule_sql % host
                list_rule = d.read_tuple(sql)
                for rule in list_rule:
                    type = rule[2]
                    if type == 0:
                        values.add(row_format % (rule[0], rule[1], 'true', '2'))
                        host_set.add(host)
                    elif type == 1:
                        values.add(row_format % (rule[0], rule[1], 'false', '3'))
                        host_set.add(host)

            f.write_file_line_pattern(local_xpath_file_path, values, "a")

        # Upload to the XPath config-file directory on HDFS
        # c.upload("/user/qingniu/xpath_cache_file/", local_xpath_file_path)
        endtime = time.clock()
        worksec = int(round((endtime - starttime)))
        rl.info('total hosts %s, elapsed %ss' % (len(host_set), worksec))
    except:
        rl.exception()
        d.rollback()
    finally:
        d.close()
Example #26
    def __init__(self, limit, fail_times):
        self.limit = limit
        self.fail_times = fail_times
        self.rl = LogUtil().get_logger('producer', 'producer' + queue_name)