Esempio n. 1
0
    def success_action(self):
        """Remove this item's row from hainiu_queue after a successful run.

        Any database error is swallowed and logged; callers are not
        notified of a failed cleanup.
        """
        delete_sql = "delete from hainiu_queue where id=%s" % self.id
        try:
            DBUtil(db_config).execute(delete_sql)
        except Exception as err:
            self.logger.exception(err)
Esempio n. 2
0
    def fail_action(self):
        """Record one failure for this queue row.

        1) Increment fail_times and stamp the failing machine's ip.
        2) When the retry count hits the consumer retry limit while still
           below the machine-level fail limit, flip is_work back to 0 so
           the row can be picked up again.
        """

        update_sql1 = """
            update hainiu_queue set fail_times=fail_times+1, fail_ip='%s'  where id=%s and fail_times < %s;
        """

        update_sql2 = """
             update hainiu_queue set is_work=0  where id=%s;
        """

        try:
            db_util = DBUtil(db_config)
            local_ip = Util().get_local_ip()
            db_util.execute_no_commit(
                update_sql1 % (local_ip, self.id, self.max_fail_times))

            next_retry = self.current_retry_times + 1
            self.logger.info("self.current_retry_times1==> %d" % next_retry)

            reached_retry_limit = next_retry == Consumer._MAX_RETRY_TIMES
            under_machine_limit = next_retry < self.max_fail_times
            if reached_retry_limit and under_machine_limit:
                db_util.execute_no_commit(update_sql2 % self.id)

            db_util.commit()
        except Exception as err:
            self.logger.exception(err)
Esempio n. 3
0
    def queue_items(self):
        """Claim up to limit_num idle type='1' rows from hainiu_queue.

        SELECT ... FOR UPDATE locks the candidate rows; each is wrapped
        in a HainiuConsumerAction, the rows are marked is_work=1 and the
        transaction commits.

        Returns:
            list of HainiuConsumerAction — fixed: the original built the
            list but never returned it (callers always got None), unlike
            the sibling queue_items implementation which does return it.
        """
        # Row-level lock: idle type='1' rows still under the fail limit.
        # (A multi-machine variant would additionally filter on
        # fail_ip != local ip.)
        select_sql = """
        select id, action, params from hainiu_queue \
        where type='1' and is_work = 0 and fail_times < %d limit 0, %d for update;
        """

        update_sql = """
        update hainiu_queue set is_work=1  where id in (%s);
        """
        actions = []
        try:
            db_util = DBUtil(db_config)
            rows = db_util.read_dict(select_sql %
                                     (self.max_fail_times, self.limit_num))
            ids = []
            for row_dict in rows:
                row_id = row_dict['id']
                actions.append(HainiuConsumerAction(row_id,
                                                    row_dict['action'],
                                                    row_dict['params'],
                                                    self.max_fail_times))
                ids.append(str(row_id))

            if ids:
                db_util.execute_no_commit(update_sql % ','.join(ids))
            db_util.commit()
        except Exception as message:
            db_util.rollback_close()
            self.logger.exception(message)
        return actions
Esempio n. 4
0
    def put_queue(self, show_num):
        """Copy every pending seed (status=0) from hainiu_web_seed into
        hainiu_queue, show_num rows per page.

        Parameters:
            show_num: page size — used both to compute the page count and
                      as the LIMIT of every paged SELECT.
        """
        select_count_sql = """
            select count(*) from hainiu_web_seed where status = 0;
        """
        select_limit_sql = """
            select url, category from hainiu_web_seed where status = 0 limit %s, %s;
        """

        insert_sql = """
            insert into hainiu_queue (type, action, params) values (%s, %s, %s);
        """
        db_util = DBUtil(db_config)
        try:
            # total pending seeds, then the number of pages to walk
            total = db_util.read_one(select_count_sql)[0]
            page_num = total / show_num if total % show_num == 0 else total / show_num + 1
            page = 0
            while page < page_num:
                offset = page * show_num
                print('%d , %d' % (offset, show_num))
                sql = select_limit_sql % (offset, show_num)
                print("select_limit_sql==> %s " % sql)
                page += 1
                # one page of seeds becomes one batch insert
                values = []
                for row in db_util.read_dict(sql):
                    # queue rows are (type=1, action=url, params=category)
                    values.append((1, row['url'], row['category']))

                print("insert values ==> %s" % values)
                db_util.executemany(insert_sql, values)
        except Exception as err:
            traceback.print_exc(err)
Esempio n. 5
0
def push_queue_items():
    """Move pending internal seeds into the download queue.

    Reads hainiu_web_seed_internally rows with status=0 in pages,
    inserts each as a type=3 hainiu_queue row whose params is
    "<id>##<param>", then flips the processed rows to status=1.
    Skips the whole push when unfinished type=3 queue rows remain.
    """
    # pending type=3 queue rows that have never failed
    count_news_seed_queue_sql = """select count(*) from hainiu_queue where type=3 and fail_times=0;"""
    # insert one download-queue row (type fixed to 3)
    insert_news_seed_internally_queue_items_sql = """insert into hainiu_queue (type,action,params) values(3,%s,%s);"""
    # total unprocessed internal seeds (FOR UPDATE locks them)
    count_news_seed_internally_sql = """select count(*) from hainiu_web_seed_internally where status=0 for update;"""
    # one page of unprocessed internal seeds
    selec_news_seed_internally_sql = """select a_url,param,id from hainiu_web_seed_internally where status=0 limit %s,%s;"""
    # mark a batch of internal seeds as pushed
    update_news_seed_internally_sql = """update hainiu_web_seed_internally set status=1 where id in (%s);"""
    rl = LogUtil().get_base_logger()
    try:
        d = DBUtil(config._HAINIU_DB)
        queue_total = d.read_one(count_news_seed_queue_sql)[0]
        if queue_total != 0:
            # previous batch not consumed yet -- do not push more
            rl.info(
                'last download_page queue not finish,last queue %s unFinish' %
                (queue_total))
            return

        starttime = time.clock()
        # NOTE(review): a second DBUtil is created here and the first
        # connection is never explicitly closed -- confirm DBUtil copes.
        d = DBUtil(config._HAINIU_DB)
        total = long(d.read_one(count_news_seed_internally_sql)[0])
        page_size = 2
        page = total / page_size
        for i in range(0, page + 1):
            # Offset stays 0 on purpose: each processed batch is flipped
            # to status=1 below, so the next SELECT sees the following
            # status=0 rows.  Relies on d.execute() committing -- verify.
            sql = selec_news_seed_internally_sql % (0, page_size)
            list = d.read_tuple(sql)
            values = []
            id_values = []
            for l in list:
                # l = (a_url, param, id); None columns become ''
                url = l[0]
                url = url if url is not None else ''
                param = l[1]
                param1 = param if param is not None else ''

                id = l[2]

                # queue params carry the seed id plus its original param
                param = '%s##%s' % (str(id), param1)
                values.append((url, param))

                id_values.append(str(id))
            if id_values.__len__() != 0:
                d.executemany_no_commit(
                    insert_news_seed_internally_queue_items_sql, values)
                ids = ','.join(id_values)
                sql = update_news_seed_internally_sql % (ids)
                d.execute(sql)
        endtime = time.clock()
        worksec = int(round((endtime - starttime)))
        rl.info(
            'push seed_internally queue finish,total items %s,action time %s\'s'
            % (total, worksec))
    except:
        # NOTE(review): sql is unbound if the failure happens before the
        # first page -- rl.error(sql) would then raise NameError.
        rl.exception()
        rl.error(sql)
        d.rollback()
    finally:
        d.close()
    def action(self):
        """Crawl self.url and classify every <a> link on the page.

        Links whose host contains the page's domain go to
        hainiu_web_seed_internally and are scheduled for download via
        redis "down:<md5>"/"exist:<md5>" keys; all others go to
        hainiu_web_seed_externally.  Returns the superclass result with
        the success flag plus [md5, url, last href, #internal,
        #external, queue_id].
        """
        is_success = True
        t = TimeUtil()
        u = Util()
        hu = HtmlUtil()
        r = RequestUtil()

        redis_util = RedisUtill()
        # "exist:<a_md5>" -> a_href for every internal link found below
        redis_dict_values = {}
        # same keys, kept in order, for the batch redis lookup
        redis_dict_keys = []

        in_values = []
        ex_values = []
        a_href = ''
        main_md5 = u.get_md5(self.url)
        update_time = t.get_timestamp()
        print update_time
        create_time = update_time
        create_day = int(t.now_day().replace('-', ''))
        create_hour = int(t.now_hour())
        try:
            # fetch the page through phantomjs, derive domain and host
            html = r.http_get_phandomjs(self.url)
            domain = hu.get_url_domain(self.url)

            soup = BeautifulSoup(html, 'lxml')
            a_docs = soup.find_all("a")
            a_set = set()
            a_param = {}
            out_json_srt = ''
            status = 0
            host = hu.get_url_host(self.url)

            for a in a_docs:
                a_href = hu.get_format_url(a, host)
                a_title = a.get_text().strip()
                # skip empty links/titles and per-page duplicates
                if a_href == '' or a_title == '':
                    continue
                if a_set.__contains__(a_href):
                    continue
                a_set.add(a_href)

                req = urllib2.Request(url=a_href)
                a_host = req.get_host() if req.get_host() is not None else ''
                a_md5 = u.get_md5(a_href)

                if a_title != '':
                    a_param['title'] = a_title
                    out_json_srt = json.dumps(a_param, ensure_ascii=False)

                a_xpath = hu.get_dom_parent_xpath_js(a)
                insert_values = (main_md5, domain, host, a_md5, a_host,
                                 a_xpath, create_time, create_day, create_hour,
                                 update_time, status,
                                 MySQLdb.escape_string(self.url),
                                 MySQLdb.escape_string(a_href),
                                 MySQLdb.escape_string(a_title), out_json_srt)
                # print insert_values
                if a_host.__contains__(domain):
                    # internal link: remember it for the redis dedup check
                    in_values.append(insert_values)

                    dict_exist_key = "exist:%s" % a_md5
                    redis_dict_values[dict_exist_key] = a_href
                    redis_dict_keys.append(dict_exist_key)
                else:
                    ex_values.append(insert_values)

            in_table = 'hainiu_web_seed_internally'
            ex_table = 'hainiu_web_seed_externally'
            insert_sql = """
                insert into <table> (md5,domain,host,a_md5,a_host,a_xpath,create_time,create_day,create_hour,update_time,status,url,a_url,a_title,param)
                      values (%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s) ON DUPLICATE KEY UPDATE update_time=values(update_time) ;
            """
            try:
                d = DBUtil(config._HAINIU_DB)
                # set the session charset to utf8mb4
                d.execute_no_commit("set NAMES utf8mb4;")
                if in_values.__len__() != 0:
                    sql = insert_sql.replace('<table>', in_table)
                    d.executemany_no_commit(sql, in_values)

                    # batch-fetch the "exist:<a_md5>" keys from redis; the
                    # values that come back are the urls already known
                    redis_exist_values = redis_util.get_values_batch_keys(
                        redis_dict_keys)
                    # rebuild "exist:<md5>" keys for the urls that do exist
                    redis_exist_keys = [
                        "exist:%s" % u.get_md5(rev)
                        for rev in redis_exist_values if rev != None
                    ]

                    # links not yet in redis get both a down:<md5> entry
                    # (pending download) and an exist:<md5> dedup marker
                    redis_dict_down_values = {}
                    for key, value in redis_dict_values.items():
                        if key not in redis_exist_keys:
                            redis_dict_down_values["down:%s" %
                                                   u.get_md5(value)] = value
                            redis_dict_down_values[key] = value

                    if redis_dict_down_values.__len__() != 0:
                        redis_util.set_batch_datas(redis_dict_down_values)

                if ex_values.__len__() != 0:
                    sql = insert_sql.replace('<table>', ex_table)
                    d.executemany_no_commit(sql, ex_values)
                d.commit()
            except:
                is_success = False
                self.rl.exception()
                self.rl.error(sql)
                d.rollback()
            finally:
                d.close()

        except:
            is_success = False
            self.rl.exception()
        finally:
            r.close_phandomjs()

        return super(self.__class__, self).result(is_success, [
            main_md5, self.url, a_href,
            in_values.__len__(),
            ex_values.__len__(), self.queue_id
        ])
def push_queue_items():
    # 符合 写入的种子的队列数据的数量
    count_news_seed_queue_sql = """select count(*) from hainiu_queue where type=3 and fail_times=0;"""
    # 生成写入队列数据 条件: type=3
    insert_news_seed_internally_queue_items_sql = """insert into hainiu_queue (type,action,params) values(3,%s,%s);"""
    # 日志
    rl = LogUtil().get_base_logger()

    redisdb = RedisUtill()
    try:

        # 开始时间
        starttime = time.clock()

        redis_data_statu = True
        # 线程锁
        lock_key = 'get_news_seed_internally_data'
        sql = ""
        total_all = 0

        d = DBUtil(config._HAINIU_DB)
        d.execute_no_commit("set NAMES utf8mb4;")
        #符合 写入的种子的队列数据的数量 --- 之前的队列数据还没有处理完,所以不重新写队列数据到队列中
        sql = count_news_seed_queue_sql

        queue_total = d.read_one(sql)[0]
        if queue_total != 0:
            rl.info(
                'last download_page queue not finish,last queue %s unFinish' %
                (queue_total))
            # return

        while redis_data_statu:

            is_lock = redisdb.get_conn().exists(lock_key)

            if is_lock == False:
                #锁上线程  --- 10 秒失效
                lockd = redisdb.get_lock(lock_key, 10)
                if lockd == False:
                    rl.info('无法获取线程锁,退出采集下载queue线程 ')
                    continue

                ips = config._REDIS_CLUSTER_CONFIG['IPS']
                port = config._REDIS_CLUSTER_CONFIG['PORT']

                def scan_limit_to_queue_table(host, port, cursor, match,
                                              count):
                    total_num = 0
                    r = redis.Redis(host, port)
                    rs = r.scan(cursor, match, count)
                    next_num = rs[0]
                    key_list = []
                    value_list = []
                    for k in rs[1]:
                        key_list.append(k)
                        total_num += 1

                    # print key_list
                    print total_num
                    values = redisdb.get_values_batch_keys(key_list)

                    for v in values:
                        value_list.append((v, ''))
                    print value_list

                    sql = insert_news_seed_internally_queue_items_sql
                    d.executemany(sql, value_list)

                    redisdb.delete_batch(rs[1])

                    if next_num == 0:
                        return total_num
                    return total_num + scan_limit_to_queue_table(
                        host, port, next_num, match, count)

                total_num = 0
                for ip in ips:
                    total_num += scan_limit_to_queue_table(
                        ip, port, 0, 'down:*', 10)
                    print '======'
                print total_num

                if total_num > 0:
                    break

                redisdb.release(lock_key)
            else:
                rl.info('其他线程正在处理,请等待 ')
                time.sleep(0.3)
        endtime = time.time()
        # 一共执行的时间
        worksec = int(round((endtime - starttime)))
        # 日志

        rl.info(
            'push seed_internally queue finish,total items %s,action time %s\'s'
            % (total_all, worksec))
    except:
        rl.exception()
        rl.error(sql)
        d.rollback()
    finally:
        redisdb.release(lock_key)
        d.close()
 def fail_action(self, values):
     """Record a failed crawl for this queue item.

     values layout (from the producing action's result list):
     0 -> seed md5, 5 -> queue row id.  Bumps fail_times/fail_ip on
     both hainiu_queue and hainiu_web_seed; on the final retry the
     queue row's type is reset to 1.
     NOTE(review): the sibling fail_action parks rows with is_work=0
     instead of setting type=1 -- confirm the difference is intended.
     """
     update_sql = """
         update hainiu_queue set fail_times=fail_times+1,fail_ip='%s' where id=%s;
     """
     update_sql_1 = """
         update hainiu_queue set type=1 where id=%s;
     """
     update_hainiu_news_seed_sql = """
         update hainiu_web_seed set fail_times=fail_times+1,fail_ip="%s" where md5="%s";
     """
     try:
         d = DBUtil(config._HAINIU_DB)
         id = values[5]
         u = Util()
         ip = u.get_local_ip()
         sql = update_sql % (ip, id)
         d.execute_no_commit(sql)
         main_md5 = values[0]
         sql = update_hainiu_news_seed_sql % (ip, main_md5)
         d.execute_no_commit(sql)
         # last allowed retry: hand the row back as a type=1 queue item
         if (self.current_retry_times == Consumer._MAX_RETRY_TIMES):
             sql = update_sql_1 % (id)
             d.execute_no_commit(sql)
         d.commit()
     except:
         self.rl.exception()
         self.rl.error(sql)
         d.rollback()
         # NOTE(review): commit right after rollback looks unintended
         d.commit()
     finally:
         d.close()
    def queue_items(self):
        """Claim up to self.limit idle find-queue rows (type=1), wrapping
        each in a NewsFindConsumer.

        SELECT ... FOR UPDATE holds the row locks until the companion
        UPDATE marks them is_work=1.  Returns the consumer list (empty
        when nothing is pending).
        """
        select_queue_sql = """
        select id,action,params from hainiu_queue where
        type=1 and is_work =0 and fail_times <=%s
        limit 0,%s for update;
        """

        update_queue_sql = """
        update hainiu_queue set is_work=1 where id in (%s);
        """

        consumers = []
        try:
            d = DBUtil(config._HAINIU_DB)
            rows = d.read_tuple(select_queue_sql % (self.fail_times,
                                                    self.limit))
            if not rows:
                return consumers
            claimed_ids = []
            for row_id, url, param in rows:
                claimed_ids.append(str(row_id))
                consumers.append(
                    NewsFindConsumer(url, '' if param is None else param,
                                     row_id))
            d.execute(update_queue_sql % (','.join(claimed_ids)))
        except:
            self.rl.exception()
            d.rollback()
            d.commit()
        finally:
            d.close()
        return consumers
Esempio n. 10
0
 def success_action(self, values):
     """Handle a successful page crawl.

     values layout (from the producing action's result list):
     0 -> seed md5, 3 -> internal link count, 4 -> external link
     count, 5 -> queue row id.  Deletes the queue row and stamps the
     seed's crawl counters and crawl time.
     """
     delete_sql = """
         delete from hainiu_queue where id=%s;
     """
     update_hainiu_news_seed_sql = """
         update hainiu_web_seed set last_crawl_internally=%s,last_crawl_externally=%s,last_crawl_time=now() where md5="%s";"""
     try:
         d = DBUtil(config._HAINIU_DB)
         id = values[5]
         sql = delete_sql % id
         # TODO: for testing, the queue row was meant to be kept
         d.execute_no_commit(sql)
         sql = update_hainiu_news_seed_sql % (values[3], values[4],
                                              values[0])
         d.execute_no_commit(sql)
         d.commit()
     except:
         self.rl.exception()
         self.rl.error(sql)
         d.rollback()
         # NOTE(review): commit right after rollback looks unintended
         d.commit()
     finally:
         d.close()
Esempio n. 11
0
def push_queue_items():
    """Push every pending seed (status=0) into hainiu_queue as type=1.

    Bails out when unfinished type=1 queue rows remain.  Each inserted
    row's params is JSON holding the seed's category and the publisher
    name derived from the url's registered domain; batches are shuffled
    before insertion and paged 1000 seeds at a time.
    """
    count_news_seed_sql = """select count(*) from hainiu_web_seed where status=0;"""
    select_news_seed_sql = """select url,category,last_crawl_time from hainiu_web_seed where status=0 limit %s,%s;"""
    insert_news_seed_queue_items_sql = """insert into hainiu_queue (type,action,params) values(1,%s,%s);"""
    count_news_seed_queue_sql = """select count(*) from hainiu_queue where type=1 and fail_times=0;"""
    rl = LogUtil().get_base_logger()
    try:
        d = DBUtil(config._HAINIU_DB)
        queue_total = d.read_one(count_news_seed_queue_sql)[0]
        if queue_total != 0:
            rl.info('last news_find queue not finish,last queue %s unFinish' %
                    (queue_total))
            return

        starttime = time.clock()
        total = long(d.read_one(count_news_seed_sql)[0])
        page_size = 1000
        last_page = total / page_size
        for page_idx in range(0, last_page + 1):
            sql = select_news_seed_sql % (page_idx * page_size, page_size)
            values = []
            for url, category, _last_crawl in d.read_tuple(sql):
                # publisher = registered-domain name without its suffix
                publisher = get_fld(url)
                if '.' in publisher:
                    publisher = publisher[:publisher.index('.')]
                param = json.dumps(
                    {'category': category, 'publisher': publisher},
                    ensure_ascii=False)
                values.append((url, param))

            if values:
                random.shuffle(values)
                d.executemany(insert_news_seed_queue_items_sql, values)
        endtime = time.clock()
        worksec = int(round((endtime - starttime)))
        rl.info(
            'push news_find queue finish,total items %s,action time %s\'s' %
            (total, worksec))
    except:
        rl.exception()
        rl.error(sql)
        d.rollback()
    finally:
        d.close()
Esempio n. 12
0
    def action(self):
        """Download self.url, archive the raw html, and upsert one row
        into hainiu_web_page.

        On success the page html is written to disk (bucketed by a
        5-minute timestamp) and the row gets status=1; on failure the
        row gets status=2 plus the failing ip.  Returns the superclass
        result with [md5, update_time, queue_id].
        """
        is_success = True
        t = TimeUtil()
        file_util = FileUtil()
        u = Util()
        hu = HtmlUtil()
        r = RequestUtil()
        # positional values for the INSERT below; indices must match the
        # chosen insert_web_page_sql column order
        values = []
        md5 = u.get_md5(self.url)
        update_time = t.get_timestamp()
        create_time = update_time
        create_day = int(t.now_day().replace('-', ''))
        create_hour = int(t.now_hour())
        now_minute = int(t.now_min())
        # snap the current minute down to a 5-minute boundary
        for i in xrange(60,-5,-5):
            if now_minute>=i:
                now_minute=i
                break
        # format as yyyyMMddHHmm, e.g. 201903181505
        now_minute = t.now_time(format='%Y%m%d%H') + ('0%s' % (str(now_minute)) if now_minute < 10 else str(now_minute))

        values.append(MySQLdb.escape_string(self.url))
        values.append(md5)
        values.append(create_time)
        values.append(create_day)
        values.append(create_hour)
        # placeholder for domain, filled in once the fetch succeeds
        values.append('')
        values.append(MySQLdb.escape_string(self.param))
        values.append(update_time)
        try:
            html = r.http_get_phandomjs(self.url)
            domain = hu.get_url_domain(self.url)
            values[5] = domain

            soup = BeautifulSoup(html, 'lxml')
            title_doc = soup.find('title')
            title = title_doc.contents[0] if title_doc is not None and len(title_doc.contents) == 1 else ''

            host = hu.get_url_host(self.url)
            values.append(host)
            values.append(MySQLdb.escape_string(title))

            # kafka push disabled; is_success stays True so the page is
            # archived to disk instead
            # k = KafkaUtil(config._KAFKA_CONFIG)
            # html = html.replace(content._SEQ1,'').replace(content._SEQ2,content._SEQ4)
            # push_str = content._SEQ3.join(('%s','%s')) % (self.url,html)
            # push_str = content._SEQ3.join(('%s','%s')) % (u.get_md5(push_str),push_str)
            # push_str = bytes(push_str)
            # is_success = k.push_message(push_str)

            if is_success:
                self.save_file(create_time,file_util,now_minute,u,self.url,html)
            else:
                self.logger.error("kafka push error")

        except:
            is_success = False
            # keep the values list aligned with the failure-path SQL
            values.append('')
            values.append('')
            # NOTE(review): logger.exception() with no message argument --
            # confirm this logger wrapper accepts that
            self.logger.exception()
        finally:
            r.close_phandomjs()

        try:
            if is_success:
                values.append(1)
                insert_web_page_sql = """
                    insert into hainiu_web_page (url,md5,create_time,create_day,create_hour,domain,param,update_time,host,
                    title,status) values ("%s","%s",%s,%s,%s,"%s","%s",%s,"%s","%s",%s) on DUPLICATE KEY  UPDATE update_time=values(update_time);
                """
            else:
                ip = u.get_local_ip()
                values.append(ip)
                values.append(2)
                insert_web_page_sql = """
                    insert into hainiu_web_page (url,md5,create_time,create_day,create_hour,domain,param,update_time,host,
                    title,fail_ip,status) values ("%s","%s",%s,%s,%s,"%s","%s",%s,"%s","%s","%s",%s)
                    on DUPLICATE KEY UPDATE fail_times=fail_times+1,fail_ip=values(fail_ip);
                """

            d = DBUtil(config._HAINIU_DB)
            sql = insert_web_page_sql % tuple(values)
            d.execute(sql)
        except:
            is_success = False
            self.logger.exception()
            self.logger.error(sql)
            d.rollback()
            d.commit()
        finally:
            d.close()


        return super(self.__class__, self).result(is_success, [md5,update_time,self.queue_id])
Esempio n. 13
0
 def fail_action(self, values):
     """Record a failed download for this queue item.

     values layout (from the download action's result list):
     0 -> page md5, 1 -> update_time, 2 -> queue row id.  Bumps fail
     counters on hainiu_queue and the internal seed row; after the
     final retry the queue row is parked with is_work=0.
     """
     update_sql = """
         update hainiu_queue set fail_times=fail_times+1,fail_ip='%s' where id=%s;
     """
     update_sql_1 = """
         update hainiu_queue set is_work=0 where id=%s;
     """
     update_hainiu_news_internally_sql = """
         update hainiu_web_seed_internally set fail_times=fail_times+1,fail_ip="%s",update_time=%s where a_md5="%s";
     """
     try:
         d = DBUtil(config._HAINIU_DB)
         id = values[2]
         u = Util()
         ip = u.get_local_ip()
         sql = update_sql % (ip, id)
         d.execute_no_commit(sql)
         sql = update_hainiu_news_internally_sql % (ip, values[1], values[0])
         d.execute_no_commit(sql)
         # last allowed retry: park the row so it is not picked up again
         if (self.current_retry_times == Consumer._MAX_RETRY_TIMES):
             sql = update_sql_1 % (id)
             d.execute_no_commit(sql)
         d.commit()
     except:
         self.logger.exception()
         self.logger.error(sql)
         d.rollback()
         # NOTE(review): commit right after rollback looks unintended
         d.commit()
     finally:
         d.close()
Esempio n. 14
0
 def success_action(self, values):
     """Handle a successful download.

     values layout (from the download action's result list):
     0 -> page md5, 1 -> update_time, 2 -> queue row id.  Deletes the
     queue row and stamps the internal seed's update_time.
     """
     delete_sql = """
         delete from hainiu_queue where id=%s;
     """
     update_hainiu_news_internally_sql = """
         update hainiu_web_seed_internally set update_time=%s where a_md5="%s";
     """
     try:
         d = DBUtil(config._HAINIU_DB)
         id = values[2]
         sql = delete_sql % id
         # TODO: for testing, the queue row was meant to be kept
         d.execute_no_commit(sql)
         # BUG FIX: was values[2] (the queue row id); the seed's
         # update_time is values[1], matching the companion fail_action
         # and the producing action's result list [md5, update_time, id]
         sql = update_hainiu_news_internally_sql % (values[1], values[0])
         d.execute_no_commit(sql)
         d.commit()
     except:
         self.logger.exception()
         self.logger.error(sql)
         d.rollback()
         # NOTE(review): commit right after rollback looks unintended
         d.commit()
     finally:
         d.close()