def action(self):
     is_success=True
     try:
         #这里应该是进行消费,也就是把hainiu_web_seed送过来的链接进行爬取url,然后放到hainiu_queue中
         #成功了就把hainiu_web_seed的status状态修改为0,一遍下一小时继续爬取
         print self.ac,self.params,self.id
         time.sleep(5)
         insert_sql = """
             insert into hainiu_queue(type,params,action) values (0,'%s','%s');
             """
         update_queue_sql = """
         update hainiu_web_seed set status=0,last_crawl_time='%s' where id in (%s);
         """
         rl = LogUtil().get_base_logger()
         try:
             print "进到消费者线程"
             db = DBUtil(config._OGC_DB)
             print insert_sql
             print self.params,self.ac
             sql=insert_sql % (self.params,self.ac)
             print sql
             db.execute(sql)
         except:
             rl.exception()
             rl.error(insert_sql)
             rl.error(update_queue_sql)
             db.rollback()
         finally:
             db.close()
     except:
         is_success=False
         self.rl.exception()
     return super(self.__class__,self).result(is_success,[self.id,self.ac,self.params])
 def fail_action(self, values):
     print "come in fail_action"
     #失败了就将记录type恢复为2,并累加fail_times
     update_sql = """
             update hainiu_queue set fail_times=fail_times+1,fail_ip='%s' where id=%s;
             """
     update_sql1 = """
             update hainiu_queue set type=2 where id =%s
             """
     try:
         d = DBUtil(config._OGC_DB)
         id = values[0]
         u = Util()
         ip = u.get_local_ip()
         sql = update_sql % (ip, id)
         d.execute(sql)
         d.execute_no_commit(sql)
         #超过单机器尝试次数,工作状态置为不工作
         if (self.try_num == Consumer._WORK_TRY_NUM):
             sql = update_sql1 % id
             d.execute_no_commit(sql)
         d.commit()
     except:
         self.rl.error(sql)
         self.rl.exception()
     finally:
         d.close()
Exemple #3
0
 def success_action(self,values):
     """Delete the finished row from ``hainiu_queue``.

     Failures are logged through ``self.rl`` and rolled back; nothing is
     returned.

     :param values: sequence whose first element is the queue row id (or a
         comma-separated id list) to delete.
     """
     # After a successful run the hainiu_queue row is removed. (An earlier
     # test-only variant updated hainiu_web_seed status instead.)
     delete_queue_sql = """
     delete from hainiu_queue where id in (%s);
     """
     # Pre-bind so the handlers cannot raise NameError when formatting or
     # connecting fails.
     sql = delete_queue_sql
     db = None
     try:
         sql = delete_queue_sql % values[0]
         db = DBUtil(config._OGC_DB)
         db.execute(sql)
     except Exception:
         self.rl.exception()
         self.rl.error(sql)
         if db is not None:
             db.rollback()
     finally:
         if db is not None:
             db.close()
Exemple #4
0
 def queue_items(self):
     """Claim pending ``hainiu_queue`` rows and wrap them as ``OGCConsumer``s.

     Selects up to ``self.limit`` rows with ``type=1``, ``is_work=0`` and
     ``fail_times <= self.fail_times`` (``for update``), builds one
     ``OGCConsumer(id, action, params)`` per row, then sets ``is_work=1`` on
     the selected ids. Errors are logged and rolled back.

     :return: list of the consumers built before any error occurred
         (possibly empty).
     """
     select_queue_sql = """
     select id,action,params from hainiu_queue where 
     type=1 and is_work=0 and fail_times<=%s
     limit 0,%s for update;
     """
     update_queue_sql = """
     update hainiu_queue set is_work=1 where id in (%s);
     """
     return_list = []
     # Pre-bind so the handlers cannot raise NameError themselves.
     sql = select_queue_sql
     d = None
     try:
         d = DBUtil(config._OGC_DB)
         sql = select_queue_sql % (self.fail_times, self.limit)
         query_ids = []
         for record in d.read_dict(sql):
             row_id = record['id']
             query_ids.append(str(row_id))
             return_list.append(
                 OGCConsumer(row_id, record['action'], record['params']))
         if query_ids:
             sql = update_queue_sql % ','.join(query_ids)
             d.execute(sql)
     except Exception:
         self.rl.exception()
         # Log the statement that was in flight, as the sibling methods do.
         self.rl.error(sql)
         if d is not None:
             d.rollback()
     finally:
         if d is not None:
             d.close()
     return return_list
 def action(self):
     """Download ``self.url``, archive the page and record it.

     Steps, in order: fetch the page through PhantomJS, re-encode it to the
     filesystem encoding, strip line breaks, write
     ``md5 + "\\001" + html`` to ``<_LOCAL_DATA_DIR>done/<md5>``, push the
     HTML to Kafka, and insert a row into ``hainiu_web_page``.

     A failure of the final insert is logged and rolled back but does not
     flip the result (unchanged from the previous behaviour); any other
     failure marks the action as failed.

     :return: the value of ``result(is_success, [self.id, self.url, self.params])``
         on the parent class.
     """
     is_success = True
     try:
         # Consume one hainiu_queue link: crawl it into hainiu_web_page,
         # save the file locally and push it to Kafka.
         r = RequestUtil()
         hu = HtmlUtil()
         u = Util()
         f = FileUtil()
         t = TimeUtil()
         db = DBUtil(config._OGC_DB)
         try:
             html = r.http_get_phandomjs(self.url)
             r.close_phandomjs()
             charset = hu.get_doc_charset(etree.HTML(html))
             html = html.decode(charset).encode(sys.getfilesystemencoding())
             title = get_title(html).decode(sys.getfilesystemencoding())
             # Remove CRLF before LF: in the opposite order the '\r\n'
             # replacement can never match and every '\r' is left behind.
             html_string = str(html).replace('\r\n', '').replace('\n', '')
             md5_html_string = u.get_md5(html_string)
             base_path = config._LOCAL_DATA_DIR % os.sep + 'done'
             file_path = base_path + os.sep + md5_html_string
             # Write the file.
             f.create_path(base_path)
             f.write_file_content(file_path,
                                  md5_html_string + "\001" + html_string)
             # Push to Kafka.
             kafka_util = KafkaUtil(config._KAFKA_CONFIG)
             kafka_util.push_message(html_string)
             # Record the result in hainiu_web_page.
             insert_web_page_sql = """
             insert into hainiu_web_page (url,md5,param,domain,host,title,create_time,
             create_day,create_hour,update_time) values("%s","%s","%s","%s","%s","%s","%s","%s","%s","%s");
             """
             # Pre-bind so the handler can always log something.
             sql = insert_web_page_sql
             try:
                 create_time = int(t.str2timestamp(t.now_time()))
                 create_day = int(t.now_day().replace("-", ""))
                 create_hour = int(t.now_hour())
                 update_time = int(t.str2timestamp(t.now_time()))
                 # NOTE(review): values are interpolated straight into the
                 # SQL text; a double quote in the title breaks the statement.
                 sql = insert_web_page_sql % (
                     self.url, md5_html_string, "{title:" + self.params + "}",
                     get_fld(self.url), hu.get_url_host(self.url), title,
                     create_time, create_day, create_hour, update_time)
                 db.execute(sql)
             except Exception:
                 self.rl.exception()
                 self.rl.error(sql)
                 db.rollback()
         finally:
             # Close the connection on every path, including failures that
             # happen before the insert is reached.
             db.close()
     except Exception:
         is_success = False
         self.rl.exception()
     # NOTE(review): super(self.__class__, self) recurses forever once this
     # class is subclassed; name the class explicitly where it is defined.
     return super(self.__class__,
                  self).result(is_success, [self.id, self.url, self.params])
Exemple #6
0
def push_queue_items():
    rl = LogUtil().get_base_logger()
    page_size = 10
    insert_seed_sql = """
        insert into hainiu_queue(type,params,action) values (0,%s,%s);
        """
    count_seed_sql = """
        select count(1) from hainiu_web_seed;
        """
    select_seed_sql = """
            select id,url,category,last_crawl_time from hainiu_web_seed where status=0
            limit %s,%s for update;
            """
    update_queue_sql = """
            update hainiu_web_seed set last_crawl_time='%s' where id in (%s);
            """
    t = TimeUtil()
    try:
        d = DBUtil(config._OGC_DB)
        queue_total = d.read_one(count_seed_sql)[0]
        page_num = queue_total / page_size + 1
        query_ids = []
        print page_num, page_size
        for i in range(0, page_num):
            sql = select_seed_sql % (i * page_size, page_size)
            select_list = d.read_tuple(sql)
            insert_list = []
            for record in select_list:
                id = record[0]
                url = record[1]
                category = record[2]
                last_crawl_time = str(record[3])
                if last_crawl_time is None or int(t.str2timestamp(last_crawl_time[:13], '%Y-%m-%d %H')) <= \
                        int(t.str2timestamp(t.get_dif_time(hour=-1, format='%Y-%m-%d %H'), format='%Y-%m-%d %H')):
                    # 进入这里的都是过去爬取的时间在一小时之前,或者没有爬取过
                    insert_list.append((category, url))
                    query_ids.append(str(id))
            d.executemany(insert_seed_sql, insert_list)
        if query_ids:
            ids = ','.join(query_ids)
            sql = update_queue_sql % (t.now_time(), ids)
            print t.now_time(), ids
            d.execute(sql)
    except:
        rl.exception()
        rl.error(sql)
        d.rollback()
    finally:
        d.close()
 def success_action(self,values):
     print "success"
     update_queue_sql = """
     update hainiu_web_seed set status=0,last_crawl_time='%s' where id in (%s);
     """
     try:
         sql = update_queue_sql % (TimeUtil().now_time(), values[0])
         db = DBUtil(config._OGC_DB)
         db.execute(sql)
     except:
         self.rl.exception()
         self.rl.error(sql)
         db.rollback()
     finally:
         db.close()
 def success_action(self, values):
     """Delete the finished row from ``hainiu_queue``.

     Failures are logged through ``self.rl`` and rolled back.

     :param values: sequence whose first element is the queue row id (or a
         comma-separated id list) to delete.
     """
     # On success the hainiu_queue record is removed.
     delete_queue_sql = """
     delete from hainiu_queue where id in (%s);
     """
     # Pre-bind so the handlers cannot raise NameError when formatting or
     # connecting fails.
     sql = delete_queue_sql
     db = None
     try:
         sql = delete_queue_sql % values[0]
         db = DBUtil(config._OGC_DB)
         db.execute(sql)
     except Exception:
         self.rl.exception()
         self.rl.error(sql)
         if db is not None:
             db.rollback()
     finally:
         if db is not None:
             db.close()
Exemple #9
0
 def success_action(self, values):
     """Delete the ``hainiu_queue`` row whose id is ``values[0]``.

     Failures are logged through ``self.rl`` and rolled back.

     :param values: sequence whose first element is the queue row id.
     """
     delete_sql = """
     delete from hainiu_queue where id=%s
     """
     # Pre-bind so the handlers cannot raise NameError themselves.
     sql = delete_sql
     d = None
     try:
         d = DBUtil(config._OGC_DB)
         sql = delete_sql % values[0]
         d.execute(sql)
     except Exception:
         self.rl.exception()
         # Log the failing statement, as the sibling methods do; the old
         # call passed no message at all.
         self.rl.error(sql)
         if d is not None:
             d.rollback()
     finally:
         if d is not None:
             d.close()
 def queue_items(self):
     ip=Util().get_local_ip()
     select_seed_sql="""
     select id,url,category,domain,host,last_crawl_time from hainiu_web_seed where 
     fail_times<=%s and locate('%s',fail_ip)=0 and status=0
     limit 0,%s for update;
     """
     update_queue_sql="""
     update hainiu_web_seed set status=1,last_crawl_time='%s' where id in (%s);
     """
     return_list=[]
     try:
         d=DBUtil(config._OGC_DB)
         sql=select_seed_sql % (self.fail_times,ip,self.limit)
         select_dict=d.read_dict(sql)
         # print select_dict
         query_ids=[]
         t=TimeUtil()
         for each in select_dict:
             id=each['id']
             url=each['url']
             category=each['category']
             domain=each['domain']
             host=each['host']
             last_crawl_time=str(each['last_crawl_time'])
             if last_crawl_time is None or int(t.str2timestamp(last_crawl_time[:13],'%Y-%m-%d %H'))<=\
                     int(t.str2timestamp(t.get_dif_time(hour=-1,format='%Y-%m-%d %H'),format='%Y-%m-%d %H')):
                 #进入这里的都是过去爬取的时间在一小时之前,或者没有爬取过
                 query_ids.append(str(id))
                 action=url
                 params=category
                 c = NewsFindActionConsumer(id, action, params)
                 return_list.append(c)
         if query_ids:
             ids=','.join(query_ids)
             sql=update_queue_sql % (t.now_time(),ids)
             print t.now_time(),ids
             d.execute(sql)
     except:
         self.rl.exception()
         self.rl.error(sql)
         d.rollback()
     finally:
         d.close()
     return return_list
Exemple #11
0
def create_seed(url="http://www.autohome.com.cn/all", category="汽车"):
    """Insert one seed row into ``hainiu_web_seed`` with ``status=0``.

    The row carries the url, its md5, the domain from ``get_tld(url)``, the
    host from ``HtmlUtil.get_url_host`` and the category. Failures are logged
    and rolled back.

    :param url: seed URL; defaults to the previously hard-coded value.
    :param category: seed category; defaults to the previously hard-coded value.
    """
    insert_sql = """
    insert into hainiu_web_seed (url,md5,domain,host,category,status) values 
    ('%s','%s','%s','%s','%s',0)
    """
    hu = HtmlUtil()
    # NOTE(review): sibling code uses get_fld() for the domain column;
    # confirm get_tld() returns the same kind of value here.
    domain = get_tld(url)
    host = hu.get_url_host(url)
    md5 = Util().get_md5(url)
    rl = LogUtil().get_base_logger()
    # Pre-bind so the handlers cannot raise NameError when connecting fails.
    d = None
    try:
        d = DBUtil(config._OGC_DB)
        sql = insert_sql % (url, md5, domain, host, category)
        d.execute(sql)
    except Exception:
        rl.exception()
        if d is not None:
            d.rollback()
    finally:
        if d is not None:
            d.close()
Exemple #12
0
 def queue_items(self):
     ip = Util().get_local_ip()
     select_queue_sql = """
     select id,action,params from hainiu_queue where 
     type=0 and fail_times<=%s and locate('%s',fail_ip)=0
     limit 0,%s for update;
     """
     #type=1意思是url已经分配给消费者了
     update_queue_sql = """
     update hainiu_queue set type=1 where id in (%s);
     """
     return_list = []
     try:
         d = DBUtil(config._OGC_DB)
         sql = select_queue_sql % (self.fail_times, ip, self.limit)
         select_dict = d.read_dict(sql)
         print select_dict
         query_ids = []
         for each in select_dict:
             id = each['id']
             url = each['action']
             category = each['params']
             query_ids.append(str(id))
             c = NewsFindQueueConsumer(id, url, category)
             return_list.append(c)
         if query_ids:
             ids = ','.join(query_ids)
             sql = update_queue_sql % ids
             d.execute(sql)
     except:
         self.rl.exception()
         self.rl.error(sql)
         d.rollback()
     finally:
         d.close()
     return return_list
 def queue_items(self):
     """Claim download-ready ``hainiu_queue`` rows as ``DownloadActionConsumer``s.

     Selects up to ``self.limit`` rows with ``type=2``,
     ``fail_times <= self.fail_times`` and whose ``fail_ip`` does not contain
     this machine's IP (``for update``), builds one
     ``DownloadActionConsumer(id, action, params)`` per row, then sets
     ``type=3`` on the selected ids. Errors are logged and rolled back.

     :return: list of the consumers built before any error occurred
         (possibly empty).
     """
     ip = Util().get_local_ip()
     select_queue_sql = """
     select id,action,params from hainiu_queue where 
     fail_times<=%s and locate('%s',fail_ip)=0 and type=2
     limit 0,%s for update;
     """
     # type=3 means the row has already been picked up by a consumer process.
     update_queue_sql = """
     update hainiu_queue set type=3 where id in (%s);
     """
     return_list = []
     # Pre-bind so the handlers cannot raise NameError themselves.
     sql = select_queue_sql
     d = None
     try:
         d = DBUtil(config._OGC_DB)
         sql = select_queue_sql % (self.fail_times, ip, self.limit)
         query_ids = []
         for each in d.read_dict(sql):
             row_id = each['id']
             query_ids.append(str(row_id))
             return_list.append(
                 DownloadActionConsumer(row_id, each['action'], each['params']))
         if query_ids:
             sql = update_queue_sql % ','.join(query_ids)
             d.execute(sql)
     except Exception:
         self.rl.exception()
         self.rl.error(sql)
         if d is not None:
             d.rollback()
     finally:
         if d is not None:
             d.close()
     return return_list
Exemple #14
0
def call_beautiful(url):
    '''
    给定url,获取
    :param url:
    :return:
    '''
    # url='http://roll.news.qq.com'
    r = RequestUtil()
    hu = HtmlUtil()
    t = TimeUtil()
    html = r.http_get_phandomjs(url)
    charset = hu.get_doc_charset(etree.HTML(html))
    domain = get_fld(url)
    host = hu.get_url_host(url)
    u = Util()
    rl = LogUtil().get_base_logger()
    print "domain:", domain, ":host:", host
    soup = BeautifulSoup(html, 'lxml')
    a_docs = soup.find_all("a")
    for a in a_docs:
        a_href = get_format_url(url, a, host, charset)
        if a_href and a.text:
            print a.text
            print a_href
            xpath = hu.get_dom_parent_xpath_js(a)
            create_time = int(t.str2timestamp(t.now_time()))
            create_day = int(t.now_day().replace("-", ""))
            create_hour = int(t.now_hour())
            update_time = int(t.str2timestamp(t.now_time()))
            if get_fld(a_href) == domain:
                print a_href
                #说明是内链接,写入redis数据库
                redis_conn = RedisUtil().get_conn()
                redis = RedisUtil()
                key1 = "exist:" + u.get_md5(a_href)
                print redis_conn.keys(key1)
                if not redis_conn.keys(key1):
                    key2 = "down:" + u.get_md5(a_href)
                    dicts = {key1: a_href, key2: a_href}
                    redis.set_batch_datas(dicts)
                    #同时写入mysql-internal数据库保存信息
                    try:
                        db = DBUtil(config._OGC_DB)
                        insert_internal_sql = """
                        insert into hainiu_web_seed_internally (url,md5,param,domain,host,a_url,a_md5,
                        a_host,a_xpath,a_title,create_time,create_day,create_hour,update_time) 
                        values("%s," * 13,"%s") on duplicate key update update_time=update_time +1;
                        """
                        #values("%s","%s","%s","%s","%s","%s","%s","%s","%s","%s","%s","%s","%s","%s");
                        sql = insert_internal_sql % (
                            url, u.get_md5(url), "{title:" + a.text + "}",
                            domain, host, a_href, u.get_md5(a_href),
                            hu.get_url_host(a_href), xpath, a.text,
                            create_time, create_day, create_hour, update_time)
                        db.execute(sql)
                    except:
                        rl.exception()
                        rl.error(sql)
                        db.rollback()
                    finally:
                        db.close()
            else:
                #外连接写入mysql数据库,因为这部分只写,不会爬取
                db = DBUtil(config._OGC_DB)
                insert_external_sql = """
                insert into hainiu_web_seed_externally (url,md5,param,domain,host,a_url,a_md5,
                        a_host,a_xpath,a_title,create_time,create_day,create_hour,update_time) 
                        values("%s," *13 ,"%s") on duplicate key update update_time=update_time +1;
                        """
                sql = insert_external_sql % (
                    url, u.get_md5(url), a.text, domain, host, a_href,
                    u.get_md5(a_href), hu.get_url_host(a_href), xpath, a.text,
                    create_time, create_day, create_hour, update_time)
                try:
                    db.execute(sql)
                except:
                    rl.exception()
                    rl.error(sql)
                    db.rollback()
                finally:
                    db.close()