def action(self):
     """Insert this seed's (params, action) pair into hainiu_queue
     (type=0) and report the outcome to the base class as
     (is_success, [id, ac, params])."""
     is_success=True
     try:
         # Consume: take the link handed over from hainiu_web_seed, crawl
         # its urls and push them into hainiu_queue.
         # On success the seed's status is reset to 0 so it is crawled
         # again next hour.
         print self.ac,self.params,self.id
         time.sleep(5)
         insert_sql = """
             insert into hainiu_queue(type,params,action) values (0,'%s','%s');
             """
         update_queue_sql = """
         update hainiu_web_seed set status=0,last_crawl_time='%s' where id in (%s);
         """
         rl = LogUtil().get_base_logger()
         try:
             print "进到消费者线程"
             db = DBUtil(config._OGC_DB)
             print insert_sql
             print self.params,self.ac
             sql=insert_sql % (self.params,self.ac)
             print sql
             db.execute(sql)
         except:
             rl.exception()
             rl.error(insert_sql)
             rl.error(update_queue_sql)
             # NOTE(review): if DBUtil() itself raised, `db` is unbound
             # here and rollback/close raise NameError.
             db.rollback()
         finally:
             db.close()
     except:
         is_success=False
         self.rl.exception()
     return super(self.__class__,self).result(is_success,[self.id,self.ac,self.params])
def push_queue_items():
    """Copy rows of hainiu_web_seed into hainiu_queue (type=1).

    Reads the seed table page by page (page_size rows per SELECT) and
    bulk-inserts (category, url) pairs via executemany.
    """
    # %s here are driver placeholders filled by executemany, not Python
    # string formatting.
    insert_sql="""
    insert into hainiu_queue(type,params,action) values (1,%s,%s);
    """
    count_sql="""
    select count(1) from hainiu_web_seed;
    """
    select_sql="""
    select url,category from hainiu_web_seed limit %s,%s;
    """
    rl=LogUtil().get_base_logger()
    try:
        d=DBUtil(config._OGC_DB)
        sql=count_sql
        queue_total=d.read_one(sql)[0]
        print "queue total",queue_total
        # NOTE(review): integer division -- any remainder rows beyond
        # page*page_size are skipped (harmless while page_size == 1).
        page_size=1
        page=queue_total/page_size
        for i in range(0,page):
            sql=select_sql % (i*page_size,page_size)
            select_list=d.read_tuple(sql)
            print "page",i
            insert_list=[]
            for record in select_list:
                url=record[0]
                category=record[1]
                insert_list.append((category,url))
                print url,category
            d.executemany(insert_sql,insert_list)
    except:
        rl.exception()
        rl.error(sql)
        d.rollback()
    finally:
        d.close()
Example #3
0
class KafkaUtil:
    """Thin wrapper around kafka producers with a process-wide producer
    cache keyed by "<hosts>_<topic>".

    The cache and its lock are class-level, so every KafkaUtil instance
    shares the cached producer connections.
    """
    __kafka_connect_cache = {}
    __lock = threading.Lock()

    def __init__(self, kafka_conf):
        # kafka_conf: dict with 'HOST' (comma-separated broker list) and
        # 'TOPIC'. The broker list is shuffled to spread load.
        host_list = [host for host in kafka_conf['HOST'].split(',')]
        random.shuffle(host_list)
        host_str = ",".join(host_list)
        self.cache_key = "_".join((host_str, kafka_conf['TOPIC']))
        self.host = host_str
        self.topic = kafka_conf['TOPIC']
        self.rl = LogUtil().get_logger('consuer', 'consumer_kafka')

    def push_message(self, message):
        """Send one message to the configured topic.

        Returns True on success; on failure logs the error, evicts the
        cached producer (so the next call reconnects) and returns False.
        """
        # Bug fix: the original acquire()/release() pair leaked the lock
        # when producer creation (or anything before release()) raised;
        # the with-statement guarantees release on every path.
        with self.__lock:
            u = Util()
            producer = u.get_dict_value(self.__kafka_connect_cache,
                                        self.cache_key)
            if producer is None:
                client = KafkaClient(hosts=self.host)
                topic = client.topics[self.topic]
                producer = topic.get_producer()
                self.__kafka_connect_cache[self.cache_key] = producer
            is_success = True
            try:
                producer.produce(message)
            except:
                is_success = False
                del self.__kafka_connect_cache[self.cache_key]
                self.rl.error('kafka push error chacheKey is %s' %
                              (self.cache_key))
                self.rl.exception()
            return is_success
Example #4
0
class Producer(threading.Thread):
    """Producer thread: pulls ConsumerAction items from a ProducerAction
    and feeds them into the shared work queue, throttled so at most
    max_num tasks are outstanding."""

    def __init__(self, queue, action, name, max_num, sleep_time,
                 work_sleep_time, work_try_num):
        # queue: shared Queue consumed by Consumer threads
        # action: ProducerAction whose queue_items() yields work units
        # max_num: consumer thread count / max unfinished queue tasks
        # sleep_time: pause between production rounds (seconds)
        # work_sleep_time / work_try_num: passed through to consumers
        super(self.__class__, self).__init__()
        self.queue = queue
        self.action = action
        self.name = name
        self.max_num = max_num
        self.sleep_time = sleep_time
        self.work_sleep_time = work_sleep_time
        self.work_try_num = work_try_num
        self.rl = LogUtil().get_logger("producer", "producer" + self.name)
        if not isinstance(action, base_producer_action.ProducerAction):
            raise Exception("action not extends producer base")

    def run(self):
        # Production loop: fetch a batch, drain it into the queue,
        # log throughput, sleep, repeat forever.
        action_list = []
        while True:
            try:
                start_time = time.clock()
                if len(action_list) == 0:
                    action_list = self.action.queue_items()
                total_items = len(action_list)
                self.rl.info('get queue %s total items is %s ' %
                             (self.name, total_items))
                # NOTE(review): while unfinished_tasks > max_num this
                # inner loop busy-spins without sleeping until consumers
                # catch up -- it burns CPU during back-pressure.
                while True:
                    if len(action_list) == 0:
                        break
                    unfinished_tasks = self.queue.unfinished_tasks
                    if unfinished_tasks <= self.max_num:
                        action = action_list.pop()
                        self.queue.put(action)

                # NOTE(review): time.clock() is CPU time on Unix and was
                # removed in Python 3.8 -- the figures are indicative.
                end_time = time.clock()
                work_time = int(round(end_time - start_time))
                work_mins = work_time / 60
                self.rl.info('put queue %s total items is %s ,total time is %s \'s,(at %s items per mins' % \
                              (self.name, total_items, work_time,
                               int(total_items) if work_mins == 0 else round(float(total_items / work_mins), 2)))
                time.sleep(self.sleep_time)
            except:
                self.rl.exception()

    def start_work(self):
        # Spin up max_num consumer threads, give them a head start,
        # then start this producer thread.
        for i in range(0, self.max_num):
            qc = queue_consumer.Consumer(self.queue, self.name + "_" + str(i),
                                         self.work_sleep_time,
                                         self.work_try_num)
            qc.start()
        time.sleep(5)
        self.start()
class NewsFindActionProducer(ProducerAction):
    """Producer that claims up to `limit` eligible seeds from
    hainiu_web_seed (status=0, not failed too often on this host, not
    crawled within the last hour), marks them status=1 and wraps each
    in a NewsFindActionConsumer action."""

    def __init__(self,limit,fail_times):
        # limit: max seeds claimed per round; fail_times: max allowed
        # failure count for a seed to still be eligible.
        super(self.__class__, self).__init__()
        self.limit = limit
        self.fail_times = fail_times
        self.rl = LogUtil().get_logger('producer', 'producer' + queue_name)

    def queue_items(self):
        """Select eligible seeds FOR UPDATE, stamp status=1 and
        last_crawl_time, and return a list of consumer actions."""
        ip=Util().get_local_ip()
        select_seed_sql="""
        select id,url,category,domain,host,last_crawl_time from hainiu_web_seed where 
        fail_times<=%s and locate('%s',fail_ip)=0 and status=0
        limit 0,%s for update;
        """
        update_queue_sql="""
        update hainiu_web_seed set status=1,last_crawl_time='%s' where id in (%s);
        """
        return_list=[]
        try:
            d=DBUtil(config._OGC_DB)
            sql=select_seed_sql % (self.fail_times,ip,self.limit)
            select_dict=d.read_dict(sql)
            # print select_dict
            query_ids=[]
            t=TimeUtil()
            for each in select_dict:
                id=each['id']
                url=each['url']
                category=each['category']
                domain=each['domain']
                host=each['host']
                last_crawl_time=str(each['last_crawl_time'])
                # NOTE(review): str(None) == 'None', so the `is None`
                # test can never be true; a NULL crawl time lands in the
                # hour-comparison branch (and raises into the outer
                # except when parsed).
                if last_crawl_time is None or int(t.str2timestamp(last_crawl_time[:13],'%Y-%m-%d %H'))<=\
                        int(t.str2timestamp(t.get_dif_time(hour=-1,format='%Y-%m-%d %H'),format='%Y-%m-%d %H')):
                    # Only seeds last crawled over an hour ago (or never
                    # crawled) get here.
                    query_ids.append(str(id))
                    action=url
                    params=category
                    c = NewsFindActionConsumer(id, action, params)
                    return_list.append(c)
            if query_ids:
                ids=','.join(query_ids)
                sql=update_queue_sql % (t.now_time(),ids)
                print t.now_time(),ids
                d.execute(sql)
        except:
            self.rl.exception()
            self.rl.error(sql)
            d.rollback()
        finally:
            d.close()
        return return_list
Example #6
0
def push_queue_items():
    """Move due seeds from hainiu_web_seed into hainiu_queue (type=0).

    Pages through the seed table; seeds whose last crawl is more than an
    hour old (or missing) are bulk-inserted into hainiu_queue and their
    last_crawl_time is stamped in one final UPDATE.
    """
    rl = LogUtil().get_base_logger()
    page_size = 10
    # %s in insert_seed_sql are driver placeholders for executemany.
    insert_seed_sql = """
        insert into hainiu_queue(type,params,action) values (0,%s,%s);
        """
    count_seed_sql = """
        select count(1) from hainiu_web_seed;
        """
    select_seed_sql = """
            select id,url,category,last_crawl_time from hainiu_web_seed where status=0
            limit %s,%s for update;
            """
    update_queue_sql = """
            update hainiu_web_seed set last_crawl_time='%s' where id in (%s);
            """
    t = TimeUtil()
    try:
        d = DBUtil(config._OGC_DB)
        queue_total = d.read_one(count_seed_sql)[0]
        # +1 page covers the remainder rows of the integer division.
        page_num = queue_total / page_size + 1
        query_ids = []
        print page_num, page_size
        for i in range(0, page_num):
            sql = select_seed_sql % (i * page_size, page_size)
            select_list = d.read_tuple(sql)
            insert_list = []
            for record in select_list:
                id = record[0]
                url = record[1]
                category = record[2]
                last_crawl_time = str(record[3])
                # NOTE(review): str(None) == 'None', so the `is None`
                # branch never triggers; the hour comparison decides.
                if last_crawl_time is None or int(t.str2timestamp(last_crawl_time[:13], '%Y-%m-%d %H')) <= \
                        int(t.str2timestamp(t.get_dif_time(hour=-1, format='%Y-%m-%d %H'), format='%Y-%m-%d %H')):
                    # Only seeds last crawled over an hour ago (or never
                    # crawled) are queued.
                    insert_list.append((category, url))
                    query_ids.append(str(id))
            d.executemany(insert_seed_sql, insert_list)
        if query_ids:
            ids = ','.join(query_ids)
            sql = update_queue_sql % (t.now_time(), ids)
            print t.now_time(), ids
            d.execute(sql)
    except:
        rl.exception()
        rl.error(sql)
        d.rollback()
    finally:
        d.close()
def push_queue_items():
    """Drain pending download links from redis into hainiu_queue.

    Scans redis "down:*" keys in pages, inserts each value into
    hainiu_queue as type=2 and deletes the consumed keys.
    """
    redis = RedisUtil()
    rl = LogUtil().get_base_logger()
    # redis_key = redis.kyes_limit_scan(pattern="down*", limit=10)
    # print redis_key
    # type=2 means a pending download link taken from redis; type=3
    # means a consumer has taken it for download.
    insert_queue_sql = """
            insert into hainiu_queue(type,params,action) values (2,'from redis','%s');
            """
    try:
        db = DBUtil(config._OGC_DB)
        # NOTE(review): counts ALL redis keys, not just "down*" ones --
        # the paging estimate is off if other key patterns exist.
        redis_len = len(redis.get_conn().keys())
        page_size = 10
        page_num = redis_len / page_size
        # redis_len = len(redis.get_conn().keys("down*"))
        # sum=0
        for i in range(0, page_num):
            redis_key = redis.kyes_limit_scan(pattern="down*",
                                              limit=page_size * (i + 1),
                                              cursor=0)
            if len(redis_key) != 0:
                redis_value = redis.get_values_batch_keys(redis_key)
                for each in redis_value:
                    print each
                    sql = insert_queue_sql % (each)
                    db.execute_no_commit(sql)
                db.commit()
                redis.delete_batch(redis_key)
            # Avoid pointless further scans.
            # if sum==redis_len:
            #     break
        # Un-paged variant below: take everything in one go.
        # redis_key=redis.get_conn().keys("down*")
        # print redis_key
        # if len(redis_key) !=0:
        #     redis_value = redis.get_values_batch_keys(redis_key)
        #     for each in redis_value:
        #         print redis_value
        #         sql=insert_queue_sql%(each[5:])
        #         db.execute_no_commit(sql)
        #     db.commit()
        #     redis.delete_batch(redis_key)
    except:
        rl.exception()
        rl.error(insert_queue_sql)
        db.rollback()
    finally:
        db.close()
Example #8
0
 def action(self):
     """Crawl self.url via call_beautiful and report the outcome to the
     base class as (is_success, [id, url, params])."""
     is_success=True
     try:
         # Consume: crawl the link handed over from hainiu_web_seed and
         # push discovered urls to redis.
         # Two records are inserted per link; data that already exists is
         # skipped, new data goes into hainiu_queue.
         rl = LogUtil().get_base_logger()
         try:
             print "come in consumer thread"
             call_beautiful(self.url)
         except:
             rl.exception()
         finally:
             pass
     except:
         is_success=False
         self.rl.exception()
     return super(self.__class__,self).result(is_success,[self.id,self.url,self.params])
Example #9
0
class NewsFindQueueProducer(ProducerAction):
    """Producer that claims up to `limit` rows (type=0) from
    hainiu_queue, marks them type=1 and wraps each in a
    NewsFindQueueConsumer action."""

    def __init__(self, limit, fail_times):
        # limit: max rows claimed per round; fail_times: max allowed
        # failure count for a row to still be eligible.
        super(self.__class__, self).__init__()
        self.limit = limit
        self.fail_times = fail_times
        self.rl = LogUtil().get_logger('producer', 'producer' + queue_name)

    def queue_items(self):
        """Select eligible rows FOR UPDATE, flag them type=1 and return
        a list of NewsFindQueueConsumer actions."""
        ip = Util().get_local_ip()
        select_queue_sql = """
        select id,action,params from hainiu_queue where 
        type=0 and fail_times<=%s and locate('%s',fail_ip)=0
        limit 0,%s for update;
        """
        # type=1 means the url has been handed to a consumer.
        update_queue_sql = """
        update hainiu_queue set type=1 where id in (%s);
        """
        return_list = []
        try:
            d = DBUtil(config._OGC_DB)
            sql = select_queue_sql % (self.fail_times, ip, self.limit)
            select_dict = d.read_dict(sql)
            print select_dict
            query_ids = []
            for each in select_dict:
                id = each['id']
                url = each['action']
                category = each['params']
                query_ids.append(str(id))
                c = NewsFindQueueConsumer(id, url, category)
                return_list.append(c)
            if query_ids:
                ids = ','.join(query_ids)
                sql = update_queue_sql % ids
                d.execute(sql)
        except:
            self.rl.exception()
            self.rl.error(sql)
            d.rollback()
        finally:
            d.close()
        return return_list
Example #10
0
class DownloadActionProducer(ProducerAction):
    """Producer that claims up to `limit` downloadable rows (type=2)
    from hainiu_queue, marks them type=3 and wraps each in a
    DownloadActionConsumer action."""

    def __init__(self, limit, fail_times):
        # limit: max rows claimed per round; fail_times: max allowed
        # failure count for a row to still be eligible.
        super(self.__class__, self).__init__()
        self.limit = limit
        self.fail_times = fail_times
        self.rl = LogUtil().get_logger("producer", "producer" + queue_name)

    def queue_items(self):
        """Select eligible rows FOR UPDATE, flag them type=3 (taken by a
        consumer) and return a list of DownloadActionConsumer actions."""
        ip = Util().get_local_ip()
        select_queue_sql = """
        select id,action,params from hainiu_queue where 
        fail_times<=%s and locate('%s',fail_ip)=0 and type=2
        limit 0,%s for update;
        """
        # type=3 marks the row as taken by a consumer process.
        update_queue_sql = """
        update hainiu_queue set type=3 where id in (%s);
        """
        return_list = []
        sql = ''
        try:
            d = DBUtil(config._OGC_DB)
            sql = select_queue_sql % (self.fail_times, ip, self.limit)
            select_dict = d.read_dict(sql)
            query_ids = []
            # (removed an unused TimeUtil instance created here)
            for each in select_dict:
                id = each['id']
                action = each['action']
                params = each['params']
                query_ids.append(str(id))
                c = DownloadActionConsumer(id, action, params)
                return_list.append(c)
            if query_ids:
                ids = ','.join(query_ids)
                sql = update_queue_sql % ids
                d.execute(sql)
        except:
            self.rl.exception()
            self.rl.error(sql)
            d.rollback()
        finally:
            d.close()
        return return_list
Example #11
0
 def send_sms(self, content, phone=config._ALERT_PHONE):
     """send alter sms for phone with content

     Returns True when the SMS gateway answers '0-OK'; returns False on
     any gateway error or exception (both are logged).
     """
     l = LogUtil().get_base_logger()
     try:
         send_url = 'http://send.sms.hainiu.com:8080/s?command=cralwer&phone=%s&' % (
             phone)
         # The gateway expects GBK-encoded content.
         send_url += urllib.urlencode(
             {'content': content.decode('utf-8').encode('gbk')})
         r = urllib2.urlopen(send_url).read()
         # (removed leftover debug prints "here01"/"here")
         if '0-OK' != r:
             l.error("短信发送失败,短信服务器返回状态为:%s,手机号:%s,内容:%s" %
                     (r, phone, content))
             return False
     except:
         l.exception()
         return False
     return True
Example #12
0
class OGCProducer(ProducerAction):
    """Producer that claims up to `limit` idle rows (type=1, is_work=0)
    from hainiu_queue, marks them is_work=1 and wraps each in an
    OGCConsumer action."""

    def __init__(self, limit, fail_times):
        # limit: max rows claimed per round; fail_times: max allowed
        # failure count for a row to still be eligible.
        super(self.__class__, self).__init__()
        self.limit = limit
        self.fail_times = fail_times
        self.rl = LogUtil().get_logger('producer', 'producer' + queue_name)

    def queue_items(self):
        """Select eligible rows FOR UPDATE, flag them as being worked on
        and return a list of OGCConsumer actions."""
        select_queue_sql = """
        select id,action,params from hainiu_queue where 
        type=1 and is_work=0 and fail_times<=%s
        limit 0,%s for update;
        """
        update_queue_sql = """
        update hainiu_queue set is_work=1 where id in (%s);
        """
        return_list = []
        sql = ''
        try:
            d = DBUtil(config._OGC_DB)
            sql = select_queue_sql % (self.fail_times, self.limit)
            select_dict = d.read_dict(sql)
            query_ids = []
            for record in select_dict:
                id = record['id']
                action = record['action']
                params = record['params']
                query_ids.append(str(id))
                c = OGCConsumer(id, action, params)
                return_list.append(c)
            if query_ids:
                ids = ','.join(query_ids)
                sql = update_queue_sql % ids
                d.execute(sql)
        except:
            self.rl.exception()
            # Bug fix: rl.error() without a message raises TypeError
            # inside the handler; log the failing statement instead.
            self.rl.error(sql)
            d.rollback()
        finally:
            d.close()
        return return_list
Example #13
0
def create_seed():
    """Insert one hard-coded seed (autohome.com.cn) into hainiu_web_seed
    with derived md5/domain/host fields and status=0."""
    url="http://www.autohome.com.cn/all"
    category="汽车"
    sql="""
    insert into hainiu_web_seed (url,md5,domain,host,category,status) values 
    ('%s','%s','%s','%s','%s',0)
    """
    hu=HtmlUtil()
    domain=get_tld(url)
    host=hu.get_url_host(url)
    u=Util()
    md5=u.get_md5(url)
    rl=LogUtil().get_base_logger()
    try:
        d=DBUtil(config._OGC_DB)
        # Fill the template in place; `sql` now holds the final statement.
        sql=sql % (url,md5,domain,host,category)
        d.execute(sql)
    except:
        rl.exception()
        d.rollback()
    finally:
        d.close()
Example #14
0
class OGCConsumer(ConsumerAction):
    def __init__(self, id, ac, params):
        super(self.__class__, self).__init__()
        self.id = id
        self.ac = ac
        self.params = params
        self.rl = LogUtil().get_logger('consumer', 'consumer' + queue_name)

    def action(self):
        is_success = True
        try:
            print self.ac, self.params
        except:
            is_success = False
            self.rl.exception()
        #这里是另外的写法
        return super(self.__class__,
                     self).result(is_success, [self.id, self.ac, self.params])

    def success_action(self, values):
        delete_sql = """
        delete from hainiu_queue where id=%s
        """
        try:
            d = DBUtil(config._OGC_DB)
            id = values[0]
            sql = delete_sql % id
            d.execute(sql)
        except:
            self.rl.exception()
            self.rl.error()
            d.rollback()
        finally:
            d.close()

    def fail_action(self, values):
        update_sql = """
        update hainiu_queue set fail_times=fail_times+1,fail_ip='%s' where id=%s;
        """
        update_sql1 = """
        update hainiu_queue set is_work=0 where id =%s
        """
        try:
            d = DBUtil(config._OGC_DB)
            id = values[0]
            u = Util()
            ip = u.get_local_ip()
            sql = update_sql % (ip, id)
            d.execute_no_commit(sql)
            if (self.try_num == Consumer.work_try_num):
                sql = update_sql1 % id
                d.execute_no_commit(sql)
            d.commit()
        except:
            self.rl.error()
            self.rl.exception()
        finally:
            d.close()
Example #15
0
class OGCConsumerAction(base_consumer_action.ConsumerAction):
    """Consumer action that turns the queued text into an "OGC"-prefixed
    string; the success/failure hooks are intentionally no-ops."""

    def __init__(self, text):
        super(self.__class__, self).__init__()
        self.text = text
        self.rl = LogUtil().get_base_logger()

    def action(self):
        """Build the payload and hand the outcome to the base class."""
        ok = True
        payload = ''
        try:
            # The consume step itself: prefix the text.
            payload = "OGC" + str(self.text)
        except:
            self.rl.exception()
            ok = False
        return self.result(ok, [payload])

    def fail_action(self, values):
        # Nothing to clean up, even once the retry budget is exhausted.
        if self.try_num >= queue_consumer.Consumer._WORK_TRY_NUM:
            pass

    def success_action(self, values):
        # No bookkeeping needed on success.
        pass
Example #16
0
class Consumer(threading.Thread):
    """Consumer thread: takes ConsumerAction items off the shared queue,
    runs them, and requeues failures until the retry budget is spent."""

    # Class-level copy of the retry limit, read by actions' fail_action
    # hooks; overwritten by every new Consumer instance.
    _WORK_TRY_NUM = 0

    def __init__(self, queue, name, sleep_time, work_try_num):
        # queue: shared work queue; sleep_time: max jitter before each
        # action; work_try_num: retries allowed per action.
        super(self.__class__, self).__init__()
        self.queue = queue
        self.name = name
        self.sleep_time = sleep_time
        self.work_try_num = work_try_num
        Consumer._WORK_TRY_NUM = work_try_num
        self.rl = LogUtil().get_logger(
            'consumer', 'consumer' + self.name[:self.name.find("_")])

    def run(self):
        # Worker loop: block on the queue, run the action after a random
        # jitter sleep, requeue on failure while retries remain.
        while True:
            try:
                # This is a blocking call.
                action = self.queue.get()
                if not isinstance(action, base_consumer_action.ConsumerAction):
                    raise Exception("action not extends consumer base")
                # Random jitter of up to sleep_time seconds, 0.1s steps.
                sleep_time = random.randint(0, self.sleep_time * 10) * 0.1
                time.sleep(sleep_time)
                action.consumer_thread_name = self.name
                # NOTE(review): time.clock() is CPU time on Unix and was
                # removed in Python 3.8 -- durations are indicative only.
                start_time = time.clock()
                re = action.action()
                end_time = time.clock()
                work_time = int(round(end_time - start_time))
                self.rl.info(("queue name %s finish,sleep time %s \'s,action time %s \'s"
                              "action retry %s times,result:%s") % \
                             (self.name, sleep_time, work_time, action.try_num, re.__str__() if re is not None else ""))
                # re[0] is the is_success flag from the action result.
                if not re[0] and action.try_num < self.work_try_num:
                    action.try_num += 1
                    self.queue.put(action)
                self.queue.task_done()
            except:
                self.rl.exception()
Example #17
0
class DownloadActionConsumer(ConsumerAction):
    def __init__(self, id, action, params):
        super(self.__class__, self).__init__()
        self.id = id
        self.url = action
        self.params = params
        self.rl = LogUtil().get_logger("consumer", "consumer" + queue_name)

    def action(self):
        is_success = True
        try:
            # 这里应该是进行消费,也就是把hainiu_queue送过来的链接进行爬取url,然后放到hainiu_web_page中
            #并且保存文件到本地,还有推到kafka中
            r = RequestUtil()
            hu = HtmlUtil()
            u = Util()
            f = FileUtil()
            t = TimeUtil()
            db = DBUtil(config._OGC_DB)
            html = r.http_get_phandomjs(self.url)
            r.close_phandomjs()
            charset = hu.get_doc_charset(etree.HTML(html))
            html = html.decode(charset).encode(sys.getfilesystemencoding())
            title = get_title(html).decode(sys.getfilesystemencoding())
            html_string = str(html).replace('\n', '').replace('\r\n', '')
            md5_html_string = u.get_md5(html_string)
            base_path = config._LOCAL_DATA_DIR % os.sep + 'done'
            file_path = config._LOCAL_DATA_DIR % os.sep + 'done' + os.sep + md5_html_string
            # 写文件
            f.create_path(base_path)
            f.write_file_content(file_path,
                                 md5_html_string + "\001" + html_string)
            # 推kafka
            kafka_util = KafkaUtil(config._KAFKA_CONFIG)
            kafka_util.push_message(html_string)
            try:
                #把结果记录写入hianiu_web_page中
                insert_web_page_sql = """
                insert into hainiu_web_page (url,md5,param,domain,host,title,create_time,
                create_day,create_hour,update_time) values("%s","%s","%s","%s","%s","%s","%s","%s","%s","%s");
                """
                create_time = int(t.str2timestamp(t.now_time()))
                create_day = int(t.now_day().replace("-", ""))
                create_hour = int(t.now_hour())
                update_time = int(t.str2timestamp(t.now_time()))
                sql = insert_web_page_sql % (
                    self.url, md5_html_string, "{title:" + self.params + "}",
                    get_fld(self.url), hu.get_url_host(self.url), title,
                    create_time, create_day, create_hour, update_time)
                db.execute(sql)
            except:
                self.rl.exception()
                self.rl.error(sql)
                db.rollback()
            finally:
                db.close()
        except:
            is_success = False
            self.rl.exception()
        return super(self.__class__,
                     self).result(is_success, [self.id, self.url, self.params])

    def success_action(self, values):
        # 成功了就把hainiu_queue的记录删除
        delete_queue_sql = """
        delete from hainiu_queue where id in (%s);
        """
        try:
            sql = delete_queue_sql % values[0]
            db = DBUtil(config._OGC_DB)
            db.execute(sql)
        except:
            self.rl.exception()
            self.rl.error(sql)
            db.rollback()
        finally:
            db.close()

    def fail_action(self, values):
        print "come in fail_action"
        #失败了就将记录type恢复为2,并累加fail_times
        update_sql = """
                update hainiu_queue set fail_times=fail_times+1,fail_ip='%s' where id=%s;
                """
        update_sql1 = """
                update hainiu_queue set type=2 where id =%s
                """
        try:
            d = DBUtil(config._OGC_DB)
            id = values[0]
            u = Util()
            ip = u.get_local_ip()
            sql = update_sql % (ip, id)
            d.execute(sql)
            d.execute_no_commit(sql)
            #超过单机器尝试次数,工作状态置为不工作
            if (self.try_num == Consumer._WORK_TRY_NUM):
                sql = update_sql1 % id
                d.execute_no_commit(sql)
            d.commit()
        except:
            self.rl.error(sql)
            self.rl.exception()
        finally:
            d.close()
class NewsFindActionConsumer(ConsumerAction):
    def __init__(self,id,action,params):
        super(self.__class__,self).__init__()
        self.id = id
        self.ac = action
        self.params = params
        self.rl = LogUtil().get_logger('consumer', 'consumer' + queue_name)
    def action(self):
        is_success=True
        try:
            #这里应该是进行消费,也就是把hainiu_web_seed送过来的链接进行爬取url,然后放到hainiu_queue中
            #成功了就把hainiu_web_seed的status状态修改为0,一遍下一小时继续爬取
            print self.ac,self.params,self.id
            time.sleep(5)
            insert_sql = """
                insert into hainiu_queue(type,params,action) values (0,'%s','%s');
                """
            update_queue_sql = """
            update hainiu_web_seed set status=0,last_crawl_time='%s' where id in (%s);
            """
            rl = LogUtil().get_base_logger()
            try:
                print "进到消费者线程"
                db = DBUtil(config._OGC_DB)
                print insert_sql
                print self.params,self.ac
                sql=insert_sql % (self.params,self.ac)
                print sql
                db.execute(sql)
            except:
                rl.exception()
                rl.error(insert_sql)
                rl.error(update_queue_sql)
                db.rollback()
            finally:
                db.close()
        except:
            is_success=False
            self.rl.exception()
        return super(self.__class__,self).result(is_success,[self.id,self.ac,self.params])
    def success_action(self,values):
        print "success"
        update_queue_sql = """
        update hainiu_web_seed set status=0,last_crawl_time='%s' where id in (%s);
        """
        try:
            sql = update_queue_sql % (TimeUtil().now_time(), values[0])
            db = DBUtil(config._OGC_DB)
            db.execute(sql)
        except:
            self.rl.exception()
            self.rl.error(sql)
            db.rollback()
        finally:
            db.close()
    def fail_action(self,values):
        update_sql="""
        update hainiu_web_seed set fail_times=fail_times+1,fail_ip='%s' where id=%s;
        """
        #超过尝试次数就把工作状态设为不工作状态
        update_sql1="""
        update hainiu_web_seed set status=0,last_crawl_time='' where id =%s
        """
        try:
            d=DBUtil(config._OGC_DB)
            id=values[0]
            u=Util
            ip=u.get_local_ip()
            sql=update_sql % (ip,id)
            d.execute_no_commit(sql)
            if(self.try_num==Consumer.work_try_num):
                sql=update_sql1 % id
                d.execute_no_commit(sql)
            d.commit()
        except:
            self.rl.error(sql)
            self.rl.exception()
        finally:
            d.close()
Example #19
0
def call_beautiful(url):
    '''
    给定url,获取
    :param url:
    :return:
    '''
    # url='http://roll.news.qq.com'
    r = RequestUtil()
    hu = HtmlUtil()
    t = TimeUtil()
    html = r.http_get_phandomjs(url)
    charset = hu.get_doc_charset(etree.HTML(html))
    domain = get_fld(url)
    host = hu.get_url_host(url)
    u = Util()
    rl = LogUtil().get_base_logger()
    print "domain:", domain, ":host:", host
    soup = BeautifulSoup(html, 'lxml')
    a_docs = soup.find_all("a")
    for a in a_docs:
        a_href = get_format_url(url, a, host, charset)
        if a_href and a.text:
            print a.text
            print a_href
            xpath = hu.get_dom_parent_xpath_js(a)
            create_time = int(t.str2timestamp(t.now_time()))
            create_day = int(t.now_day().replace("-", ""))
            create_hour = int(t.now_hour())
            update_time = int(t.str2timestamp(t.now_time()))
            if get_fld(a_href) == domain:
                print a_href
                #说明是内链接,写入redis数据库
                redis_conn = RedisUtil().get_conn()
                redis = RedisUtil()
                key1 = "exist:" + u.get_md5(a_href)
                print redis_conn.keys(key1)
                if not redis_conn.keys(key1):
                    key2 = "down:" + u.get_md5(a_href)
                    dicts = {key1: a_href, key2: a_href}
                    redis.set_batch_datas(dicts)
                    #同时写入mysql-internal数据库保存信息
                    try:
                        db = DBUtil(config._OGC_DB)
                        insert_internal_sql = """
                        insert into hainiu_web_seed_internally (url,md5,param,domain,host,a_url,a_md5,
                        a_host,a_xpath,a_title,create_time,create_day,create_hour,update_time) 
                        values("%s," * 13,"%s") on duplicate key update update_time=update_time +1;
                        """
                        #values("%s","%s","%s","%s","%s","%s","%s","%s","%s","%s","%s","%s","%s","%s");
                        sql = insert_internal_sql % (
                            url, u.get_md5(url), "{title:" + a.text + "}",
                            domain, host, a_href, u.get_md5(a_href),
                            hu.get_url_host(a_href), xpath, a.text,
                            create_time, create_day, create_hour, update_time)
                        db.execute(sql)
                    except:
                        rl.exception()
                        rl.error(sql)
                        db.rollback()
                    finally:
                        db.close()
            else:
                #外连接写入mysql数据库,因为这部分只写,不会爬取
                db = DBUtil(config._OGC_DB)
                insert_external_sql = """
                insert into hainiu_web_seed_externally (url,md5,param,domain,host,a_url,a_md5,
                        a_host,a_xpath,a_title,create_time,create_day,create_hour,update_time) 
                        values("%s," *13 ,"%s") on duplicate key update update_time=update_time +1;
                        """
                sql = insert_external_sql % (
                    url, u.get_md5(url), a.text, domain, host, a_href,
                    u.get_md5(a_href), hu.get_url_host(a_href), xpath, a.text,
                    create_time, create_day, create_hour, update_time)
                try:
                    db.execute(sql)
                except:
                    rl.exception()
                    rl.error(sql)
                    db.rollback()
                finally:
                    db.close()
Example #20
0
class NewsFindQueueConsumer(ConsumerAction):
    def __init__(self, id, action, params):
        super(self.__class__, self).__init__()
        self.id = id
        self.url = action
        self.params = params
        self.rl = LogUtil().get_logger('consumer', 'consumer' + queue_name)

    def action(self):
        is_success = True
        try:
            #这里应该是进行消费,也就是把hainiu_web_seed送过来的链接进行爬取url,然后放到redis中
            # 插入两条数据,如果数据已经存在了就pass,如果数据不存在就插入hainiu_queue中
            rl = LogUtil().get_base_logger()
            try:
                print "进到消费者线程"
                call_beautiful(self.url)
            except:
                rl.exception()
            finally:
                pass
        except:
            is_success = False
            self.rl.exception()
        return super(self.__class__,
                     self).result(is_success, [self.id, self.url, self.params])

    def success_action(self, values):
        #成功之后应该删除hainiu_queue表中的数据,这里为了测试方便先修改状态,之后改成删除
        update_queue_sql = """
        update hainiu_web_seed set status=0,last_crawl_time='%s' where id in (%s);
        """
        try:
            sql = update_queue_sql % (TimeUtil().now_time(), self.id)
            db = DBUtil(config._OGC_DB)
            db.execute(sql)
        except:
            self.rl.exception()
            self.rl.error(sql)
            db.rollback()
        finally:
            db.close()

    def fail_action(self, values):
        #失败之后恢复type为0,以便让其他线程继续访问
        update_sql = """
        update hainiu_web_seed set fail_times=fail_times+1,fail_ip='%s' where id=%s;
        """
        update_sql1 = """
        update hainiu_web_seed set status=0,last_crawl_time='' where id =%s
        """
        try:
            d = DBUtil(config._OGC_DB)
            id = values[0]
            u = Util
            ip = u.get_local_ip()
            sql = update_sql % (ip, id)
            d.execute_no_commit(sql)
            if (self.try_num == Consumer.work_try_num):
                sql = update_sql1 % id
                d.execute_no_commit(sql)
            d.commit()
        except:
            self.rl.error(sql)
            self.rl.exception()
        finally:
            d.close()