Example #1
 def __init__(self, id, action, params):
     super(self.__class__, self).__init__()
     self.id = id
     self.url = action
     self.params = params
     self.rl = LogUtil().get_logger('consumer', 'consumer' + queue_name)
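
All of these snippets subclass ConsumerAction, whose definition is not shown on this page. Below is a minimal sketch of the interface the examples rely on, inferred from the calls they make (result(), try_num, and the action/success_action/fail_action hooks); the real class in the hainiu project may differ:

class ConsumerAction(object):
    # Inferred base class; every attribute here is deduced from the examples below.
    def __init__(self):
        self.try_num = 0  # retry counter, compared against the consumer's try limit

    def action(self):
        # subclasses do the actual work and return self.result(...)
        raise NotImplementedError

    def result(self, is_success, values):
        # package the outcome; the consumer framework presumably dispatches to
        # success_action(values) or fail_action(values) based on is_success
        return is_success, values

    def success_action(self, values):
        pass

    def fail_action(self, values):
        pass
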
Example #2
class NewsFindQueueConsumer(ConsumerAction):
    def __init__(self, id, action, params):
        super(self.__class__, self).__init__()
        self.id = id
        self.url = action
        self.params = params
        self.rl = LogUtil().get_logger('consumer', 'consumer' + queue_name)

    def action(self):
        is_success = True
        try:
            # Consume here: crawl the links delivered from hainiu_web_seed for
            # URLs, then put them into Redis.
            # Two rows get inserted; if the data already exists, skip it,
            # otherwise insert it into hainiu_queue.
            rl = LogUtil().get_base_logger()
            try:
                print "entered consumer thread"
                call_beautiful(self.url)
            except:
                rl.exception()
        except:
            is_success = False
            self.rl.exception()
        return super(self.__class__,
                     self).result(is_success, [self.id, self.url, self.params])

    def success_action(self, values):
        # On success the queued rows should be deleted; to make testing easier
        # we only update the status for now and will switch to deletion later
        update_queue_sql = """
        update hainiu_web_seed set status=0,last_crawl_time='%s' where id in (%s);
        """
        sql = ''
        db = DBUtil(config._OGC_DB)
        try:
            sql = update_queue_sql % (TimeUtil().now_time(), self.id)
            db.execute(sql)
        except:
            self.rl.exception()
            self.rl.error(sql)
            db.rollback()
        finally:
            db.close()

    def fail_action(self, values):
        # On failure, reset status to 0 so that other threads can keep
        # accessing the seed
        update_sql = """
        update hainiu_web_seed set fail_times=fail_times+1,fail_ip='%s' where id=%s;
        """
        update_sql1 = """
        update hainiu_web_seed set status=0,last_crawl_time='' where id=%s;
        """
        sql = ''
        d = DBUtil(config._OGC_DB)
        try:
            id = values[0]
            u = Util()
            ip = u.get_local_ip()
            sql = update_sql % (ip, id)
            d.execute_no_commit(sql)
            if (self.try_num == Consumer.work_try_num):
                sql = update_sql1 % id
                d.execute_no_commit(sql)
            d.commit()
        except:
            self.rl.error(sql)
            self.rl.exception()
        finally:
            d.close()
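
The UPDATE statements above are built with % string interpolation, which breaks on values containing quotes and is open to SQL injection. DBUtil's API is not shown on this page, but with a plain MySQLdb connection the same update can be written with driver-side parameter binding; this is only an illustrative alternative, not the project's actual helper:

import MySQLdb

def mark_seed_done(db_config, seed_id, crawl_time):
    # same UPDATE as success_action above, but with bound parameters so the
    # driver handles quoting and escaping
    conn = MySQLdb.connect(**db_config)
    try:
        cur = conn.cursor()
        cur.execute(
            "update hainiu_web_seed set status=0, last_crawl_time=%s where id=%s",
            (crawl_time, seed_id))
        conn.commit()
    except MySQLdb.Error:
        conn.rollback()
        raise
    finally:
        conn.close()
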
Example #3
 def __init__(self, limit, fail_times):
     super(self.__class__, self).__init__()
     self.limit = limit
     self.fail_times = fail_times
     self.rl = LogUtil().get_logger('producer', 'producer' + queue_name)
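
Example #3 shows only the producer's constructor. The limit and fail_times arguments suggest a polling loop that fetches at most limit pending seeds and skips those that have already failed too often; the method below is purely a guess at that loop, and everything in it beyond limit and fail_times (the query, the assumption that DBUtil.execute returns rows) is hypothetical:

    def queue_items(self):
        # hypothetical polling query: fetch up to `limit` pending seeds,
        # skipping those that failed more than `fail_times` times
        select_sql = """
        select id, url, param from hainiu_web_seed
        where status=0 and fail_times<=%s limit %s;
        """
        db = DBUtil(config._OGC_DB)
        try:
            return db.execute(select_sql % (self.fail_times, self.limit))
        finally:
            db.close()
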
Example #4
def call_beautiful(url):
    '''
    Given a URL, fetch the page and collect the links in it
    :param url: the page URL to crawl
    :return:
    '''
    # url='http://roll.news.qq.com'
    r = RequestUtil()
    hu = HtmlUtil()
    t = TimeUtil()
    html = r.http_get_phandomjs(url)
    charset = hu.get_doc_charset(etree.HTML(html))
    domain = get_fld(url)
    host = hu.get_url_host(url)
    u = Util()
    rl = LogUtil().get_base_logger()
    print "domain:", domain, ":host:", host
    soup = BeautifulSoup(html, 'lxml')
    a_docs = soup.find_all("a")
    sql = ''
    db = DBUtil(config._OGC_DB)
    try:
        for a in a_docs:
            a_href = get_format_url(url, a, host, charset)
            if a_href and a.text:
                print a.text
                print a_href
                xpath = hu.get_dom_parent_xpath_js(a)
                create_time = int(t.str2timestamp(t.now_time()))
                create_day = int(t.now_day().replace("-", ""))
                create_hour = int(t.now_hour())
                update_time = int(t.str2timestamp(t.now_time()))
                if get_fld(a_href) == domain:
                    print a_href
                    # internal link: record it in the Redis database
                    redis_conn = RedisUtil().get_conn()
                    redis = RedisUtil()
                    key1 = "exist:" + u.get_md5(a_href)
                    print redis_conn.keys(key1)
                    if not redis_conn.keys(key1):
                        key2 = "down:" + u.get_md5(a_href)
                        dicts = {key1: a_href, key2: a_href}
                        redis.set_batch_datas(dicts)
                        # also save the record to the MySQL internal-links table
                        insert_internal_sql = """
                        insert into hainiu_web_seed_internally (url,md5,param,domain,host,a_url,a_md5,
                        a_host,a_xpath,a_title,create_time,create_day,create_hour,update_time)
                        values("%s","%s","%s","%s","%s","%s","%s","%s","%s","%s","%s","%s","%s","%s") on duplicate key update update_time=update_time+1;
                        """
                        sql = insert_internal_sql % (url, u.get_md5(url), "{title:" + a.text + "}", domain, host, a_href, u.get_md5(a_href),
                                                     hu.get_url_host(a_href), xpath, a.text, create_time, create_day, create_hour, update_time)
                        db.execute(sql)
                else:
                    # external link: write to MySQL only; these are recorded but not crawled
                    insert_external_sql = """
                    insert into hainiu_web_seed_externally (url,md5,param,domain,host,a_url,a_md5,
                    a_host,a_xpath,a_title,create_time,create_day,create_hour,update_time)
                    values("%s","%s","%s","%s","%s","%s","%s","%s","%s","%s","%s","%s","%s","%s") on duplicate key update update_time=update_time+1;
                    """
                    sql = insert_external_sql % (url, u.get_md5(url), "{title:" + a.text + "}", domain, host, a_href, u.get_md5(a_href),
                                                 hu.get_url_host(a_href), xpath, a.text, create_time, create_day, create_hour, update_time)
                    db.execute(sql)
                # print a_href,'_',xpath,u.get_md5(xpath)
    except:
        rl.exception()
        rl.error(sql)
        db.rollback()
    finally:
        db.close()
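
One detail worth flagging in call_beautiful: redis_conn.keys(key1) runs a server-side pattern scan just to test a single key. With a standard redis-py client (RedisUtil is project-specific and not shown here), the constant-time check is exists and the batch write maps to mset; the helper below is illustrative only:

import redis

def record_internal_link(r, md5, a_href):
    # r is a redis.StrictRedis instance; the 'exist:'/'down:' key prefixes
    # follow the naming used in call_beautiful above
    exist_key = "exist:" + md5
    if not r.exists(exist_key):  # O(1) lookup, unlike keys(), which scans
        down_key = "down:" + md5
        r.mset({exist_key: a_href, down_key: a_href})
        return True  # new link; the caller should also insert the MySQL row
    return False
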
Example #5
 def __init__(self, text):
     super(self.__class__, self).__init__()
     self.text = text
     self.rl = LogUtil().get_base_logger()
class DownloadActionConsumer(ConsumerAction):
    def __init__(self, id, action, params):
        super(self.__class__, self).__init__()
        self.id = id
        self.url = action
        self.params = params
        self.rl = LogUtil().get_logger("consumer", "consumer" + queue_name)

    def action(self):
        is_success = True
        try:
            # Consume here: crawl the URL handed over from hainiu_queue, store the
            # result in hainiu_web_page, save the page to a local file, and push it to Kafka
            r = RequestUtil()
            hu = HtmlUtil()
            u = Util()
            f = FileUtil()
            t = TimeUtil()
            db = DBUtil(config._OGC_DB)
            html = r.http_get_phandomjs(self.url)
            r.close_phandomjs()
            charset = hu.get_doc_charset(etree.HTML(html))
            html = html.decode(charset).encode(sys.getfilesystemencoding())
            title = get_title(html).decode(sys.getfilesystemencoding())
            html_string = str(html).replace('\n', '').replace('\r\n', '')
            md5_html_string = u.get_md5(html_string)
            base_path = config._LOCAL_DATA_DIR % os.sep + 'done'
            file_path = base_path + os.sep + md5_html_string
            # 写文件
            f.create_path(base_path)
            f.write_file_content(file_path,
                                 md5_html_string + "\001" + html_string)
            # 推kafka
            kafka_util = KafkaUtil(config._KAFKA_CONFIG)
            kafka_util.push_message(html_string)
            sql = ''
            try:
                # write the result record into hainiu_web_page
                insert_web_page_sql = """
                insert into hainiu_web_page (url,md5,param,domain,host,title,create_time,
                create_day,create_hour,update_time) values("%s","%s","%s","%s","%s","%s","%s","%s","%s","%s");
                """
                create_time = int(t.str2timestamp(t.now_time()))
                create_day = int(t.now_day().replace("-", ""))
                create_hour = int(t.now_hour())
                update_time = int(t.str2timestamp(t.now_time()))
                sql = insert_web_page_sql % (
                    self.url, md5_html_string, "{title:" + self.params + "}",
                    get_fld(self.url), hu.get_url_host(self.url), title,
                    create_time, create_day, create_hour, update_time)
                db.execute(sql)
            except:
                self.rl.exception()
                self.rl.error(sql)
                db.rollback()
            finally:
                db.close()
        except:
            is_success = False
            self.rl.exception()
        return super(self.__class__,
                     self).result(is_success, [self.id, self.url, self.params])

    def success_action(self, values):
        # on success, delete the record from hainiu_queue
        delete_queue_sql = """
        delete from hainiu_queue where id in (%s);
        """
        sql = ''
        db = DBUtil(config._OGC_DB)
        try:
            sql = delete_queue_sql % values[0]
            db.execute(sql)
        except:
            self.rl.exception()
            self.rl.error(sql)
            db.rollback()
        finally:
            db.close()

    def fail_action(self, values):
        print "entered fail_action"
        # On failure, restore the record's type to 2 and increment fail_times
        update_sql = """
        update hainiu_queue set fail_times=fail_times+1,fail_ip='%s' where id=%s;
        """
        update_sql1 = """
        update hainiu_queue set type=2 where id=%s;
        """
        sql = ''
        d = DBUtil(config._OGC_DB)
        try:
            id = values[0]
            u = Util()
            ip = u.get_local_ip()
            sql = update_sql % (ip, id)
            d.execute_no_commit(sql)
            # once the per-machine retry limit is reached, mark the record as no longer workable
            if (self.try_num == Consumer._WORK_TRY_NUM):
                sql = update_sql1 % id
                d.execute_no_commit(sql)
            d.commit()
        except:
            self.rl.error(sql)
            self.rl.exception()
        finally:
            d.close()
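
KafkaUtil and config._KAFKA_CONFIG are also project-specific. For reference, the push_message call in DownloadActionConsumer.action corresponds roughly to the following kafka-python usage; the topic name here is a placeholder, since the examples never show the real one:

from kafka import KafkaProducer

def push_page(bootstrap_servers, html_string, topic='hainiu_html'):
    # topic name is a guess; the examples above never show which topic is used
    producer = KafkaProducer(bootstrap_servers=bootstrap_servers)
    producer.send(topic, html_string.encode('utf-8'))
    producer.flush()  # block until the message has actually been delivered
    producer.close()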