Example #1
class HainiuConsumerAction(ConsumerAction):
    def __init__(self, id, act, params, max_fail_times):
        super(self.__class__, self).__init__()
        self.id = id
        self.act = act
        self.params = params
        self.max_fail_times = max_fail_times

        self.logger = LogUtil().get_logger("HainiuConsumerAction",
                                           "HainiuConsumerAction")

    def action(self):
        print 'id=%s, action=%s, params=%s' % (self.id, self.act, self.params)

        return self.result(True, [self.id, self.act, self.params])

    def success_action(self):
        """
        删除队列的数据信息
        """
        sql = "delete from hainiu_queue where id=%s" % self.id

        try:
            db_util = DBUtil(db_config)

            db_util.execute(sql)
        except Exception, message:
            self.logger.exception(message)
        finally:
            db_util.close()
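
The ConsumerAction and ProducerAction base classes these examples subclass are not shown on this page. Below is a minimal sketch of what they appear to provide, inferred only from the calls the examples make (result(), current_retry_times, consumer_thread_name, queue_items()); the bodies are assumptions, not the project's actual implementation:

class ProducerAction(object):
    def queue_items(self):
        # subclasses return a list of ConsumerAction instances
        raise NotImplementedError

class ConsumerAction(object):
    def __init__(self):
        # retry counter read and incremented by the Consumer thread (Example #8)
        self.current_retry_times = 0
        # set by the Consumer thread before action() runs
        self.consumer_thread_name = ''

    def action(self):
        raise NotImplementedError

    def result(self, is_success, values):
        # pack the success flag with the detail values; the Consumer thread
        # reads result[0] as the flag and result[1:] as the detail
        return [is_success] + values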
Example #2
def push_queue_items():
    count_news_seed_queue_sql = """select count(*) from hainiu_queue where type=3 and fail_times=0;"""
    insert_news_seed_internally_queue_items_sql = """insert into hainiu_queue (type,action,params) values(3,%s,%s);"""
    count_news_seed_internally_sql = """select count(*) from hainiu_web_seed_internally where status=0 for update;"""
    selec_news_seed_internally_sql = """select a_url,param,id from hainiu_web_seed_internally where status=0 limit %s,%s;"""
    update_news_seed_internally_sql = """update hainiu_web_seed_internally set status=1 where id in (%s);"""
    rl = LogUtil().get_base_logger()
    try:
        d = DBUtil(config._HAINIU_DB)
        queue_total = d.read_one(count_news_seed_queue_sql)[0]
        if queue_total != 0:
            rl.info(
                'last download_page queue not finish,last queue %s unFinish' %
                (queue_total))
            return

        starttime = time.clock()
        d = DBUtil(config._HAINIU_DB)
        total = long(d.read_one(count_news_seed_internally_sql)[0])
        page_size = 2
        page = total / page_size
        for i in range(0, page + 1):
            # always read from offset 0: rows pushed in the previous pass are
            # flipped to status=1 below, so they drop out of the next SELECT
            sql = selec_news_seed_internally_sql % (0, page_size)
            list = d.read_tuple(sql)
            values = []
            id_values = []
            for l in list:
                url = l[0]
                url = url if url is not None else ''
                param = l[1]
                param1 = param if param is not None else ''

                id = l[2]

                param = '%s##%s' % (str(id), param1)
                values.append((url, param))

                id_values.append(str(id))
            if len(id_values) != 0:
                d.executemany_no_commit(
                    insert_news_seed_internally_queue_items_sql, values)
                ids = ','.join(id_values)
                sql = update_news_seed_internally_sql % (ids)
                d.execute(sql)
        endtime = time.clock()
        worksec = int(round((endtime - starttime)))
        rl.info(
            'push seed_internally queue finish,total items %s,action time %s\'s'
            % (total, worksec))
    except:
        rl.exception()
        rl.error(sql)
        d.rollback()
    finally:
        d.close()
Example #3
class HainiuProducerAction(ProducerAction):
    def __init__(self, max_fail_times, limit_num):
        super(self.__class__, self).__init__()
        self.max_fail_times = max_fail_times
        self.limit_num = limit_num
        self.logger = LogUtil().get_logger('HainiuProducerAction',
                                           'HainiuProducerAction')

    def queue_items(self):
        # when running on multiple machines, add fail_ip != ip to the query
        # select_sql = """
        # select id, action, params from hainiu_queue \
        # where type='1' and is_work = 0 and fail_ip != '%s' and  fail_times < %d limit 0, %d for update;
        # """

        # row lock (SELECT ... FOR UPDATE)
        select_sql = """
        select id, action, params from hainiu_queue \
        where type='1' and is_work = 0 and fail_times < %d limit 0, %d for update;
        """

        update_sql = """
        update hainiu_queue set is_work=1  where id in (%s);
        """
        list = []
        try:
            db_util = DBUtil(db_config)
            # read multiple rows
            result = db_util.read_dict(select_sql %
                                       (self.max_fail_times, self.limit_num))
            ids = []

            for row_dict in result:
                id = row_dict['id']
                action = row_dict['action']
                params = row_dict['params']

                c_action = HainiuConsumerAction(id, action, params,
                                                self.max_fail_times)
                list.append(c_action)
                #[1,2,3,4]
                ids.append(str(id))

            if len(ids) != 0:
                ids = ','.join(ids)
                db_util.execute_no_commit(update_sql % ids)
            db_util.commit()
        except Exception, message:
            db_util.rollback_close()
            self.logger.exception(message)

        finally:
            db_util.close()

        return list
Example #4
class DownLoadProducer(ProducerAction):
    def __init__(self, limit, pro_flag, fail_times, queue_name):
        self.limit = limit
        self.fail_times = fail_times
        self.pro_flag = pro_flag
        self.queue_name = queue_name
        self.rl = LogUtil().get_logger('producer', 'producer' + queue_name)

    def queue_items(self):
        # select_queue_sql = """
        # select id,action,params from hainiu_queue where
        # type=3 and is_work =0 and fail_times <=%s and fail_ip <> '%s'
        # limit 0,%s for update;
        # """

        select_queue_sql = """
        select id,action,params from hainiu_queue where 
        type=3 and is_work =0 and fail_times <=%s
        limit 0,%s for update;
        """

        update_queue_sql = """
        update hainiu_queue set is_work=1 where id in (%s);
        """

        list = []
        try:
            d = DBUtil(config._HAINIU_DB)
            sql = select_queue_sql % (self.fail_times,self.limit)
            tuple = d.read_tuple(sql)
            if len(tuple) == 0:
                return list
            queue_ids = ''
            for t in tuple:
                queue_id = t[0]
                url = t[1]
                param = '' if t[2] is None else t[2]
                queue_ids += str(queue_id) + ','
                c = DownLoadConsumer(url, param, queue_id, self.pro_flag, self.queue_name)
                list.append(c)
            queue_ids = queue_ids[:-1]
            d.execute(update_queue_sql % (queue_ids))
        except:
            self.rl.exception()
            d.rollback()
            d.commit()
        finally:
            d.close()
        return list
Example #5
def push_queue_items():
    count_news_seed_sql = """select count(*) from hainiu_web_seed where status=0;"""
    select_news_seed_sql = """select url,category,last_crawl_time from hainiu_web_seed where status=0 limit %s,%s;"""
    insert_news_seed_queue_items_sql = """insert into hainiu_queue (type,action,params) values(1,%s,%s);"""
    count_news_seed_queue_sql = """select count(*) from hainiu_queue where type=1 and fail_times=0;"""
    rl = LogUtil().get_base_logger()
    try:
        d = DBUtil(config._HAINIU_DB)
        queue_total = d.read_one(count_news_seed_queue_sql)[0]
        if queue_total != 0:
            rl.info('last news_find queue not finish,last queue %s unFinish' %
                    (queue_total))
            return

        starttime = time.clock()
        total = long(d.read_one(count_news_seed_sql)[0])
        page_size = 1000
        page = total / page_size
        for i in range(0, page + 1):
            sql = select_news_seed_sql % (i * page_size, page_size)
            list = d.read_tuple(sql)
            values = []
            for l in list:
                url = l[0]
                publisher = get_fld(url)
                publisher = publisher[:publisher.index('.')] if '.' in publisher else publisher
                param = {}
                param['category'] = l[1]
                param['publisher'] = publisher
                param = json.dumps(param, ensure_ascii=False)
                values.append((url, param))

            if len(values) != 0:
                random.shuffle(values)
                d.executemany(insert_news_seed_queue_items_sql, values)
        endtime = time.clock()
        worksec = int(round((endtime - starttime)))
        rl.info(
            'push news_find queue finish,total items %s,action time %s\'s' %
            (total, worksec))
    except:
        rl.exception()
        rl.error(sql)
        d.rollback()
    finally:
        d.close()
Example #6
class DemoConsumerAction(ConsumerAction):
    def __init__(self, name):
        super(self.__class__, self).__init__()
        self.name = name

        self.logger = LogUtil().get_logger("DemoConsumerAction",
                                           'DemoConsumerAction')

    def action(self):
        self.logger.info('consume %s' % self.name)

        flag = True

        return self.result(flag, [self.name])

    def success_action(self):
        print 'success_op() ==> %s' % self.name

    def fail_action(self):
        print 'fail_op() ==> %s' % self.name
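
To drive this demo consumer through the Producer thread in Example #7, a matching producer action is needed. A hypothetical minimal sketch, assuming only the base-class contract sketched after Example #1 (DemoProducerAction and batch_size are illustrative names, not part of the project):

class DemoProducerAction(ProducerAction):
    def __init__(self, batch_size):
        super(self.__class__, self).__init__()
        self.batch_size = batch_size

    def queue_items(self):
        # produce one batch of demo consumer actions per call
        return [DemoConsumerAction('demo_%d' % i) for i in range(self.batch_size)]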
Example #7
class Producer(threading.Thread):
    """
    生产者线程
    """
    def __init__(self, queue, p_action, name, p_sleep_time, c_max_num,
                 c_max_sleep_time, c_retry_times):
        """
        生产者线程初始化参数
        :param queue:            队列
        :param p_action:         生产动作对象实例
        :param name:             线程名称
        :param p_sleep_time:     生产线程每多长时间工作一次
        :param c_max_num:        消费线程的最大线程数
        :param c_max_sleep_time: 消费线程工作间隔最大休眠时间
        :param c_retry_times:    消费动作对象action 最大重试次数

        """
        super(self.__class__, self).__init__()
        self.queue = queue
        self.p_action = p_action
        self.name = name
        self.p_sleep_time = p_sleep_time
        self.c_max_num = c_max_num
        self.c_max_sleep_time = c_max_sleep_time
        self.c_retry_times = c_retry_times

        # verify that p_action is a ProducerAction instance; raise if not
        if not isinstance(self.p_action, ProducerAction):
            raise Exception("%s is not a ProducerAction instance" %
                            self.p_action.__class__.__name__)
        # initialize the logger
        self.logger = LogUtil().get_logger("producer_%s" % self.name,
                                           "producer_%s" % self.name)

    def run(self):

        list = []
        while True:
            try:
                # record the start time
                start_time = time.time()

                # if the list is empty, call p_action.queue_items() to
                # produce a list of ConsumerAction instances
                if len(list) == 0:
                    list = self.p_action.queue_items()

                # how many actions this run produced
                total_num = len(list)

                # log it
                self.logger.info(
                    "queue.name=【producer_%s】, current time produce %d "
                    "actions" % (self.name, total_num))

                while True:
                    # list drained; go back and produce more
                    if len(list) == 0:
                        break

                    # while the queue's unfinished count is at or below the
                    # max number of consumer threads, put actions into it
                    if self.queue.unfinished_tasks <= self.c_max_num:
                        c_action = list.pop()

                        self.queue.put(c_action)

                # record the end time
                end_time = time.time()

                run_time = end_time - start_time

                # compute the per-minute production rate
                if run_time == 0:
                    rate = total_num
                else:
                    rate = round(float(total_num * 60) / run_time, 2)

                self.logger.info(
                    "queue.name=【producer_%s】, total_num=%d,"
                    " produce %d actions/min, sleep_time=%d" %
                    (self.name, total_num, rate, self.p_sleep_time))

                # sleep before the next production run
                time.sleep(self.p_sleep_time)

            except Exception, message:
                self.logger.exception(message)
Example #8
class Consumer(threading.Thread):

    _MAX_RETRY_TIMES = 0

    def __init__(self, queue, name, max_sleep_time, retry_times):
        super(self.__class__, self).__init__()
        self.queue = queue
        self.name = name
        self.max_sleep_time = max_sleep_time
        self.retry_times = retry_times
        Consumer._MAX_RETRY_TIMES = retry_times
        # initialize the logger
        self.logger = LogUtil().get_logger("consumer_%s" % self.name,
                                           "consumer_%s" % self.name)

    def run(self):
        while True:
            try:
                # if the queue is empty, sleep a while and check again
                if self.queue.empty():
                    time.sleep(self.max_sleep_time)
                    continue

                # record the start time
                start_time = time.time()

                # take an action off the queue
                action = self.queue.get()

                action.consumer_thread_name = self.name

                # then call action() to consume it
                result = action.action()

                rs = 'SUCCESS' if result[0] else 'FAIL'

                # record the end time
                end_time = time.time()

                # pick a random sleep time
                random_sleep_time = round(
                    random.uniform(0.2, self.max_sleep_time), 2)

                run_time = end_time - start_time

                # log the result
                self.logger.info(
                    "queue.name=【consumer_%s】, run_time=%d, sleep_time=%d, retry_times=%d, "
                    " result=%s, detail=%s" %
                    (self.name, run_time, random_sleep_time,
                     action.current_retry_times, rs, result[1:]))

                # on failure, retry while the retry count is below the maximum
                if not result[0] and action.current_retry_times < self.retry_times:
                    action.current_retry_times += 1
                    self.queue.put(action)

                # always mark the task done, success or failure
                self.queue.task_done()

                # random sleep
                time.sleep(random_sleep_time)
            except Exception, message:
                self.logger.exception(message)
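
A minimal driver wiring the Producer and Consumer threads together through a shared queue, using the hypothetical DemoProducerAction sketched after Example #6; the parameter values are illustrative only:

import Queue

queue = Queue.Queue()
# one producer: refill the queue every 5 seconds, cap of 3 consumers,
# consumer max sleep 2s, up to 3 retries per action
Producer(queue, DemoProducerAction(20), 'demo', 5, 3, 2, 3).start()
# three consumer threads, matching c_max_num above
for i in range(3):
    Consumer(queue, 'demo_%d' % i, 2, 3).start()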
Example #9
class DownLoadConsumer(ConsumerAction):
    def __init__(self, url, param, queue_id, pro_flag, queue_name):
        ConsumerAction.__init__(self)
        self.url = url[:-1] if url.endswith('/') else url
        self.param = param
        self.queue_id = queue_id
        self.pro_flag = pro_flag
        self.queue_name = queue_name
        self.logger = LogUtil().get_logger('consumer', 'consumer' + queue_name)

    def action(self):
        is_success = True
        t = TimeUtil()
        file_util = FileUtil()
        u = Util()
        hu = HtmlUtil()
        r = RequestUtil()
        values = []
        md5 = u.get_md5(self.url)
        update_time = t.get_timestamp()
        create_time = update_time
        create_day = int(t.now_day().replace('-', ''))
        create_hour = int(t.now_hour())
        now_minute = int(t.now_min())
        # bucket the current minute down to a 5-minute boundary, e.g. 37 -> 35
        for i in xrange(60, -5, -5):
            if now_minute >= i:
                now_minute = i
                break
        # format as yyyyMMddHHmm, e.g. 201903181505
        now_minute = t.now_time(format='%Y%m%d%H') + ('0%s' % (str(now_minute)) if now_minute < 10 else str(now_minute))

        values.append(MySQLdb.escape_string(self.url))
        values.append(md5)
        values.append(create_time)
        values.append(create_day)
        values.append(create_hour)
        values.append('')
        values.append(MySQLdb.escape_string(self.param))
        values.append(update_time)
        try:
            html = r.http_get_phandomjs(self.url)
            domain = hu.get_url_domain(self.url)
            values[5] = domain

            soup = BeautifulSoup(html, 'lxml')
            title_doc = soup.find('title')
            title = title_doc.contents[0] if title_doc is not None and len(title_doc.contents) == 1 else ''

            host = hu.get_url_host(self.url)
            values.append(host)
            values.append(MySQLdb.escape_string(title))

            # k = KafkaUtil(config._KAFKA_CONFIG)
            # html = html.replace(content._SEQ1,'').replace(content._SEQ2,content._SEQ4)
            # push_str = content._SEQ3.join(('%s','%s')) % (self.url,html)
            # push_str = content._SEQ3.join(('%s','%s')) % (u.get_md5(push_str),push_str)
            # push_str = bytes(push_str)
            # is_success = k.push_message(push_str)

            if is_success:
                self.save_file(create_time,file_util,now_minute,u,self.url,html)
            else:
                self.logger.error("kafka push error")

        except:
            is_success = False
            values.append('')
            values.append('')
            self.logger.exception()
        finally:
            r.close_phandomjs()

        try:
            if is_success:
                values.append(1)
                insert_web_page_sql = """
                    insert into hainiu_web_page (url,md5,create_time,create_day,create_hour,domain,param,update_time,host,
                    title,status) values ("%s","%s",%s,%s,%s,"%s","%s",%s,"%s","%s",%s) on DUPLICATE KEY  UPDATE update_time=values(update_time);
                """
            else:
                ip = u.get_local_ip()
                values.append(ip)
                values.append(2)
                insert_web_page_sql = """
                    insert into hainiu_web_page (url,md5,create_time,create_day,create_hour,domain,param,update_time,host,
                    title,fail_ip,status) values ("%s","%s",%s,%s,%s,"%s","%s",%s,"%s","%s","%s",%s)
                    on DUPLICATE KEY UPDATE fail_times=fail_times+1,fail_ip=values(fail_ip);
                """

            d = DBUtil(config._HAINIU_DB)
            sql = insert_web_page_sql % tuple(values)
            d.execute(sql)
        except:
            is_success = False
            self.logger.exception()
            self.logger.error(sql)
            d.rollback()
            d.commit()
        finally:
            d.close()


        return super(self.__class__, self).result(is_success, [md5,update_time,self.queue_id])


    def success_action(self, values):
        delete_sql = """
            delete from hainiu_queue where id=%s;
        """
        update_hainiu_news_internally_sql = """
            update hainiu_web_seed_internally set update_time=%s where a_md5="%s";
        """
        try:
            d = DBUtil(config._HAINIU_DB)
            id = values[2]
            sql = delete_sql % id
            # TODO: for testing, do not delete from the queue table
            d.execute_no_commit(sql)
            sql = update_hainiu_news_internally_sql % (values[2],values[0])
            d.execute_no_commit(sql)
            d.commit()
        except:
            self.logger.exception()
            self.logger.error(sql)
            d.rollback()
            d.commit()
        finally:
            d.close()



    def fail_action(self, values):
        update_sql = """
            update hainiu_queue set fail_times=fail_times+1,fail_ip='%s' where id=%s;
        """
        update_sql_1 = """
            update hainiu_queue set is_work=0 where id=%s;
        """
        update_hainiu_news_internally_sql = """
            update hainiu_web_seed_internally set fail_times=fail_times+1,fail_ip="%s",update_time=%s where a_md5="%s";
        """
        try:
            d = DBUtil(config._HAINIU_DB)
            id = values[2]
            u = Util()
            ip = u.get_local_ip()
            sql = update_sql % (ip, id)
            d.execute_no_commit(sql)
            sql = update_hainiu_news_internally_sql % (ip, values[1], values[0])
            d.execute_no_commit(sql)
            if self.current_retry_times == Consumer._MAX_RETRY_TIMES:
                sql = update_sql_1 % (id)
                d.execute_no_commit(sql)
            d.commit()
        except:
            self.logger.exception()
            self.logger.error(sql)
            d.rollback()
            d.commit()
        finally:
            d.close()


    def save_file(self, create_time, file_util, now_minute, u, url, html):
        #downloadnews_1_one_201903181505
        # TODO: single-machine debugging of the file download
        # self.consumer_thread_name = "downloadnews"
        # html_file_path_cache[self.consumer_thread_name] = 'downloadnews_one_201903211115'
        now_file_name = '%s_%s_%s' % (self.consumer_thread_name, self.pro_flag, now_minute)
        # look up last_file_name in the file-path cache by the current thread name
        last_file_name = u.get_dict_value(html_file_path_cache, self.consumer_thread_name)
        print 'last_file_name==>%s' % last_file_name
        print 'now_file_name==>%s' % now_file_name
        # then store now_file_name in the cache under the current thread name
        html_file_path_cache[self.consumer_thread_name] = now_file_name
        #/tmp/python/hainiu_cralwer/data/tmp/downloadnews_1_one
        tmp_path = config._LOCAL_DATA_DIR % ('%s/%s_%s' % ('tmp', self.consumer_thread_name, self.pro_flag))
        # by default, prefix a newline separator
        start_char = content._SEQ2
        # on the first write, or when rolling to a new file, no leading newline
        if last_file_name is None or now_file_name != last_file_name:
            start_char = ''
            # if the previous tmp file exists and has data, move it to the done directory and rename it
            if os.path.exists(tmp_path) and os.path.getsize(tmp_path) > 0:
                #/tmp/python/hainiu_cralwer/data/done/downloadnews_1_one_201903181505_1545376668
                done_path = config._LOCAL_DATA_DIR % ('%s/%s_%s' % ('done', now_file_name, create_time))
                shutil.move(tmp_path, done_path)
        # otherwise keep appending records to the current file
        html = html.replace(content._SEQ1,'').replace(content._SEQ2,content._SEQ4)
        record_str = content._SEQ3.join(('%s','%s')) % (url,html)
        record_str = content._SEQ3.join(('%s','%s')) % (u.get_md5(record_str),record_str)
        html_record_format_str = start_char + record_str
        file_util.write_file_content_pattern(tmp_path, html_record_format_str, pattern='a')
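
The rolling file name above embeds a 5-minute time bucket computed by the xrange loop in action(). A standalone sketch of just that bucketing rule, using plain datetime instead of the project's TimeUtil (which is not shown here):

from datetime import datetime

def five_minute_bucket(now):
    # round the minute down to a 5-minute boundary, e.g. 15:07 -> 15:05
    minute = now.minute - (now.minute % 5)
    return now.strftime('%Y%m%d%H') + '%02d' % minute

print five_minute_bucket(datetime(2019, 3, 18, 15, 7))  # 201903181505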
Example #10
def push_queue_items():
    # count of matching seed queue items already in the queue
    count_news_seed_queue_sql = """select count(*) from hainiu_queue where type=3 and fail_times=0;"""
    # insert queue items, with type=3
    insert_news_seed_internally_queue_items_sql = """insert into hainiu_queue (type,action,params) values(3,%s,%s);"""
    # logger
    rl = LogUtil().get_base_logger()

    redisdb = RedisUtill()
    try:

        # start time
        starttime = time.time()

        redis_data_status = True
        # key for the redis-based lock
        lock_key = 'get_news_seed_internally_data'
        sql = ""
        total_all = 0

        d = DBUtil(config._HAINIU_DB)
        d.execute_no_commit("set NAMES utf8mb4;")
        # if queue items from the previous run are still unprocessed, do not push new ones
        sql = count_news_seed_queue_sql

        queue_total = d.read_one(sql)[0]
        if queue_total != 0:
            rl.info(
                'last download_page queue not finish,last queue %s unFinish' %
                (queue_total))
            # return

        while redis_data_status:

            is_lock = redisdb.get_conn().exists(lock_key)

            if not is_lock:
                # acquire the lock; it expires after 10 seconds
                lockd = redisdb.get_lock(lock_key, 10)
                if not lockd:
                    rl.info('could not acquire the lock, exiting the queue-push thread')
                    continue

                ips = config._REDIS_CLUSTER_CONFIG['IPS']
                port = config._REDIS_CLUSTER_CONFIG['PORT']

                def scan_limit_to_queue_table(host, port, cursor, match,
                                              count):
                    total_num = 0
                    r = redis.Redis(host, port)
                    rs = r.scan(cursor, match, count)
                    next_num = rs[0]
                    key_list = []
                    value_list = []
                    for k in rs[1]:
                        key_list.append(k)
                        total_num += 1

                    # print key_list
                    print total_num
                    values = redisdb.get_values_batch_keys(key_list)

                    for v in values:
                        value_list.append((v, ''))
                    print value_list

                    sql = insert_news_seed_internally_queue_items_sql
                    d.executemany(sql, value_list)

                    redisdb.delete_batch(rs[1])

                    if next_num == 0:
                        return total_num
                    return total_num + scan_limit_to_queue_table(
                        host, port, next_num, match, count)

                total_num = 0
                for ip in ips:
                    total_num += scan_limit_to_queue_table(
                        ip, port, 0, 'down:*', 10)
                    print '======'
                print total_num
                # keep the total so the final log line reports it
                total_all = total_num

                if total_num > 0:
                    break

                redisdb.release(lock_key)
            else:
                rl.info('another thread is processing; waiting')
                time.sleep(0.3)
        endtime = time.time()
        # total elapsed time
        worksec = int(round((endtime - starttime)))
        # log it

        rl.info(
            'push seed_internally queue finish,total items %s,action time %s\'s'
            % (total_all, worksec))
    except:
        rl.exception()
        rl.error(sql)
        d.rollback()
    finally:
        redisdb.release(lock_key)
        d.close()
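
RedisUtill is the project's redis helper and its implementation is not shown on this page. Below is a hypothetical single-node sketch of the methods these examples call, built on redis-py; the real class targets a cluster (see _REDIS_CLUSTER_CONFIG above), so treat this only as an illustration of the assumed semantics:

import redis

class RedisUtill(object):
    def __init__(self, host='localhost', port=6379):
        self.conn = redis.Redis(host, port)

    def get_conn(self):
        return self.conn

    def get_lock(self, key, expire_seconds):
        # SET key NX EX: True only if the lock was newly acquired
        return bool(self.conn.set(key, '1', nx=True, ex=expire_seconds))

    def release(self, key):
        self.conn.delete(key)

    def get_values_batch_keys(self, keys):
        return self.conn.mget(keys) if keys else []

    def set_batch_datas(self, kv_dict):
        self.conn.mset(kv_dict)

    def delete_batch(self, keys):
        if keys:
            self.conn.delete(*keys)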
Example #11
 def __init__(self, limit, fail_times):
     self.limit = limit
     self.fail_times = fail_times
     self.rl = LogUtil().get_logger('NewsFindProducer', 'NewsFindProducer')
Example #12
class NewsFindConsumer(ConsumerAction):
    def __init__(self, url, param, queue_id):
        ConsumerAction.__init__(self)
        self.url = url[:-1] if url.endswith('/') else url
        self.param = param
        self.queue_id = queue_id
        self.rl = LogUtil().get_logger('NewsFindConsumer', 'NewsFindConsumer')

    def action(self):
        is_success = True
        t = TimeUtil()
        u = Util()
        hu = HtmlUtil()
        r = RequestUtil()

        redis_util = RedisUtill()
        redis_dict_values = {}
        redis_dict_keys = []

        in_values = []
        ex_values = []
        a_href = ''
        main_md5 = u.get_md5(self.url)
        update_time = t.get_timestamp()
        print update_time
        create_time = update_time
        create_day = int(t.now_day().replace('-', ''))
        create_hour = int(t.now_hour())
        try:
            html = r.http_get_phandomjs(self.url)
            domain = hu.get_url_domain(self.url)

            soup = BeautifulSoup(html, 'lxml')
            a_docs = soup.find_all("a")
            a_set = set()
            a_param = {}
            out_json_srt = ''
            status = 0
            host = hu.get_url_host(self.url)

            for a in a_docs:
                a_href = hu.get_format_url(a, host)
                a_title = a.get_text().strip()
                if a_href == '' or a_title == '':
                    continue
                if a_href in a_set:
                    continue
                a_set.add(a_href)

                req = urllib2.Request(url=a_href)
                a_host = req.get_host() if req.get_host() is not None else ''
                a_md5 = u.get_md5(a_href)

                if a_title != '':
                    a_param['title'] = a_title
                    out_json_srt = json.dumps(a_param, ensure_ascii=False)

                a_xpath = hu.get_dom_parent_xpath_js(a)
                insert_values = (main_md5, domain, host, a_md5, a_host,
                                 a_xpath, create_time, create_day, create_hour,
                                 update_time, status,
                                 MySQLdb.escape_string(self.url),
                                 MySQLdb.escape_string(a_href),
                                 MySQLdb.escape_string(a_title), out_json_srt)
                # print insert_values
                if domain in a_host:
                    in_values.append(insert_values)

                    dict_exist_key = "exist:%s" % a_md5
                    redis_dict_values[dict_exist_key] = a_href
                    redis_dict_keys.append(dict_exist_key)
                else:
                    ex_values.append(insert_values)

            in_table = 'hainiu_web_seed_internally'
            ex_table = 'hainiu_web_seed_externally'
            insert_sql = """
                insert into <table> (md5,domain,host,a_md5,a_host,a_xpath,create_time,create_day,create_hour,update_time,status,url,a_url,a_title,param)
                      values (%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s) ON DUPLICATE KEY UPDATE update_time=values(update_time) ;
            """
            try:
                d = DBUtil(config._HAINIU_DB)
                # set the session character set to utf8mb4
                d.execute_no_commit("set NAMES utf8mb4;")
                if len(in_values) != 0:
                    sql = insert_sql.replace('<table>', in_table)
                    d.executemany_no_commit(sql, in_values)

                    # look up the exist:a_md5 keys in redis; the values
                    # returned are the URLs that are already known
                    redis_exist_values = redis_util.get_values_batch_keys(
                        redis_dict_keys)
                    # convert the values that do exist back into exist:a_md5 keys
                    redis_exist_keys = [
                        "exist:%s" % u.get_md5(rev)
                        for rev in redis_exist_values if rev is not None
                    ]

                    # for each link in this batch that redis does not know yet,
                    # create both the down:a_md5 and exist:a_md5 keys
                    redis_dict_down_values = {}
                    for key, value in redis_dict_values.items():
                        if key not in redis_exist_keys:
                            redis_dict_down_values["down:%s" %
                                                   u.get_md5(value)] = value
                            redis_dict_down_values[key] = value

                    if len(redis_dict_down_values) != 0:
                        redis_util.set_batch_datas(redis_dict_down_values)

                if len(ex_values) != 0:
                    sql = insert_sql.replace('<table>', ex_table)
                    d.executemany_no_commit(sql, ex_values)
                d.commit()
            except:
                is_success = False
                self.rl.exception()
                self.rl.error(sql)
                d.rollback()
            finally:
                d.close()

        except:
            is_success = False
            self.rl.exception()
        finally:
            r.close_phandomjs()

        return super(self.__class__, self).result(is_success, [
            main_md5, self.url, a_href,
            len(in_values),
            len(ex_values), self.queue_id
        ])

    def success_action(self, values):
        delete_sql = """
            delete from hainiu_queue where id=%s;
        """
        update_hainiu_news_seed_sql = """
            update hainiu_web_seed set last_crawl_internally=%s,last_crawl_externally=%s,last_crawl_time=now() where md5="%s";"""
        try:
            d = DBUtil(config._HAINIU_DB)
            id = values[5]
            sql = delete_sql % id
            # TODO: for testing, do not delete from the queue table
            d.execute_no_commit(sql)
            sql = update_hainiu_news_seed_sql % (values[3], values[4],
                                                 values[0])
            d.execute_no_commit(sql)
            d.commit()
        except:
            self.rl.exception()
            self.rl.error(sql)
            d.rollback()
            d.commit()
        finally:
            d.close()

    def fail_action(self, values):
        update_sql = """
            update hainiu_queue set fail_times=fail_times+1,fail_ip='%s' where id=%s;
        """
        update_sql_1 = """
            update hainiu_queue set type=1 where id=%s;
        """
        update_hainiu_news_seed_sql = """
            update hainiu_web_seed set fail_times=fail_times+1,fail_ip="%s" where md5="%s";
        """
        try:
            d = DBUtil(config._HAINIU_DB)
            id = values[5]
            u = Util()
            ip = u.get_local_ip()
            sql = update_sql % (ip, id)
            d.execute_no_commit(sql)
            main_md5 = values[0]
            sql = update_hainiu_news_seed_sql % (ip, main_md5)
            d.execute_no_commit(sql)
            if self.current_retry_times == Consumer._MAX_RETRY_TIMES:
                sql = update_sql_1 % (id)
                d.execute_no_commit(sql)
            d.commit()
        except:
            self.rl.exception()
            self.rl.error(sql)
            d.rollback()
            d.commit()
        finally:
            d.close()
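
Read together, Examples #10 and #12 imply a simple redis key scheme for link deduplication between the link finder and the downloader: each discovered internal link gets an exist:<md5> key that stays behind permanently, and a down:<md5> key that lives only until the queue pusher scans it. A short sketch of that lifecycle, reusing the hypothetical RedisUtill from the sketch after Example #10:

import hashlib

def get_md5(s):
    # stand-in for the project's Util().get_md5()
    return hashlib.md5(s).hexdigest()

url = 'http://news.example.com/article/1'
md5 = get_md5(url)
redis_util = RedisUtill()

# finder (Example #12): record a link it has not seen before
redis_util.set_batch_datas({'exist:%s' % md5: url, 'down:%s' % md5: url})

# pusher (Example #10): move pending downloads into hainiu_queue, then
# delete only the down:* keys; exist:* stays so the link is never re-queued
pending = redis_util.get_values_batch_keys(['down:%s' % md5])
redis_util.delete_batch(['down:%s' % md5])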