def fail_action(self, values):
     print "come in fail_action"
     #on failure, reset the record's type to 2 and increment fail_times
     update_sql = """
             update hainiu_queue set fail_times=fail_times+1,fail_ip='%s' where id=%s;
             """
     update_sql1 = """
             update hainiu_queue set type=2 where id =%s
             """
     try:
         d = DBUtil(config._OGC_DB)
         id = values[0]
         u = Util()
         ip = u.get_local_ip()
         sql = update_sql % (ip, id)
         d.execute_no_commit(sql)
         #once this machine's retry limit is reached, mark the record as not in work
         if self.try_num == Consumer._WORK_TRY_NUM:
             sql = update_sql1 % id
             d.execute_no_commit(sql)
         d.commit()
     except:
         self.rl.error(sql)
         self.rl.exception()
     finally:
         d.close()
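Read together with the queue_items examples further down, this failure path forms a small retry state machine: every failure bumps fail_times and stamps fail_ip, and once one machine has used up Consumer._WORK_TRY_NUM attempts the row's type is reset so other workers can claim it. A minimal sketch of that driver loop (the DemoConsumer stub and the loop itself are assumptions; only the fail_action semantics mirror the code above):

class DemoConsumer(object):
    # stand-in for the project's Consumer; _WORK_TRY_NUM mirrors the constant above
    _WORK_TRY_NUM = 3

    def __init__(self):
        self.try_num = 0

    def action(self):
        return False  # pretend every crawl attempt fails

    def fail_action(self, values):
        print 'fail_times+1, fail_ip recorded for id=%s' % values[0]
        if self.try_num == DemoConsumer._WORK_TRY_NUM:
            print 'local retries exhausted, type reset so other machines can claim id=%s' % values[0]

c = DemoConsumer()
while c.try_num < DemoConsumer._WORK_TRY_NUM:
    c.try_num += 1
    if c.action():
        break
    c.fail_action([42])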
Example #2
def test_beautiful():
    # url='http://roll.news.qq.com'
    url = 'http://www.baidu.com'
    # url='http://roll.mil.news.sina.com.cn/col/zgjq/index/shtml'

    r = RequestUtil()
    hu = HtmlUtil()
    html = r.http_get_phandomjs(url)
    print get_title(html)
    # domain=get_tld(url)
    domain = get_fld(url)
    host = hu.get_url_host(url)
    u = Util()
    print "domain:", domain, ":host:", host
    soup = BeautifulSoup(html, 'lxml')

    print 'charset:', hu.get_doc_charset(etree.HTML(html))
    a_docs = soup.find_all("a")
    for a in a_docs:
        a_href = get_format_url(url, a, host)
        if a.text:
            print a.text
        if a_href:
            xpath = hu.get_dom_parent_xpath_js(a)
            print a_href, '_', xpath, u.get_md5(xpath)
 def action(self):
     is_success = True
     try:
         # consume here: crawl the url handed over from hainiu_queue, write the
         # result into hainiu_web_page, save a copy locally, and push it to kafka
         r = RequestUtil()
         hu = HtmlUtil()
         u = Util()
         f = FileUtil()
         t = TimeUtil()
         db = DBUtil(config._OGC_DB)
         html = r.http_get_phandomjs(self.url)
         r.close_phandomjs()
         charset = hu.get_doc_charset(etree.HTML(html))
         html = html.decode(charset).encode(sys.getfilesystemencoding())
         title = get_title(html).decode(sys.getfilesystemencoding())
         html_string = str(html).replace('\r', '').replace('\n', '')
         md5_html_string = u.get_md5(html_string)
         base_path = config._LOCAL_DATA_DIR % 'done'
         file_path = base_path + os.sep + md5_html_string
         # write the record to a local file
         f.create_path(base_path)
         f.write_file_content(file_path,
                              md5_html_string + "\001" + html_string)
         # push the record to kafka
         kafka_util = KafkaUtil(config._KAFKA_CONFIG)
         kafka_util.push_message(html_string)
         try:
             #record the result in hainiu_web_page
             insert_web_page_sql = """
             insert into hainiu_web_page (url,md5,param,domain,host,title,create_time,
             create_day,create_hour,update_time) values("%s","%s","%s","%s","%s","%s","%s","%s","%s","%s");
             """
             create_time = int(t.str2timestamp(t.now_time()))
             create_day = int(t.now_day().replace("-", ""))
             create_hour = int(t.now_hour())
             update_time = int(t.str2timestamp(t.now_time()))
             sql = insert_web_page_sql % (
                 self.url, md5_html_string, "{title:" + self.params + "}",
                 get_fld(self.url), hu.get_url_host(self.url), title,
                 create_time, create_day, create_hour, update_time)
             db.execute(sql)
         except:
             self.rl.exception()
             self.rl.error(sql)
             db.rollback()
         finally:
             db.close()
     except:
         is_success = False
         self.rl.exception()
     return super(self.__class__,
                  self).result(is_success, [self.id, self.url, self.params])
Example #4
 def push_message(self, message):
     self.__lock.acquire()
     try:
         u = Util()
         producer = u.get_dict_value(self.__kafka_connect_cache, self.cache_key)
         if producer is None:
             client = KafkaClient(hosts=self.host)
             topic = client.topics[self.topic]
             producer = topic.get_producer()
             self.__kafka_connect_cache[self.cache_key] = producer
         is_success = True
         try:
             producer.produce(message)
         except:
             is_success = False
             del self.__kafka_connect_cache[self.cache_key]
             self.rl.error('kafka push error cacheKey is %s' % self.cache_key)
             self.rl.exception()
     finally:
         # always release the lock, even if producer setup itself raises
         self.__lock.release()
     return is_success
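A hedged usage sketch for the method above (the constructor argument mirrors the KafkaUtil(config._KAFKA_CONFIG) call in the action example; the rest is an assumption):

# producers are cached per cache_key, so repeated pushes reuse one connection;
# a failed push evicts the cached producer and the next call reconnects
kafka_util = KafkaUtil(config._KAFKA_CONFIG)
if not kafka_util.push_message('{"url": "http://example.com"}'):
    print 'kafka push failed; caller decides whether to retry on the next cycle'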
Example #5
def create_seed():
    url = "http://www.autohome.com.cn/all"
    category = "汽车"  # "automobile"
    sql = """
    insert into hainiu_web_seed (url,md5,domain,host,category,status) values 
    ('%s','%s','%s','%s','%s',0)
    """
    hu = HtmlUtil()
    # get_fld gives the registrable domain that the other examples store;
    # recent versions of the tld package return only the suffix from get_tld
    domain = get_fld(url)
    host = hu.get_url_host(url)
    u = Util()
    md5 = u.get_md5(url)
    rl = LogUtil().get_base_logger()
    try:
        d = DBUtil(config._OGC_DB)
        sql = sql % (url, md5, domain, host, category)
        d.execute(sql)
    except:
        rl.exception()
        d.rollback()
    finally:
        d.close()
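The switch to get_fld above matters because recent versions of the tld package split the two helpers: get_fld returns the registrable domain, while get_tld returns only the suffix. A quick check (assuming tld>=0.9 is installed):

from tld import get_fld, get_tld

url = "http://www.autohome.com.cn/all"
print get_fld(url)  # autohome.com.cn -- first-level (registrable) domain
print get_tld(url)  # com.cn -- public suffix only, in tld>=0.9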
Example #6
 def fail_action(self, values):
     update_sql = """
     update hainiu_queue set fail_times=fail_times+1,fail_ip='%s' where id=%s;
     """
     update_sql1 = """
     update hainiu_queue set is_work=0 where id =%s
     """
     try:
         d = DBUtil(config._OGC_DB)
         id = values[0]
         u = Util()
         ip = u.get_local_ip()
         sql = update_sql % (ip, id)
         d.execute_no_commit(sql)
         if self.try_num == Consumer.work_try_num:
             sql = update_sql1 % id
             d.execute_no_commit(sql)
         d.commit()
     except:
         self.rl.error(sql)
         self.rl.exception()
     finally:
         d.close()
Example #7
 def fail_action(self, values):
     #after a failure, reset type to 0 so other threads can pick the record up again
     update_sql = """
     update hainiu_queue set fail_times=fail_times+1,fail_ip='%s' where id=%s;
     """
     update_sql1 = """
     update hainiu_queue set type=0 where id =%s
     """
     try:
         d = DBUtil(config._OGC_DB)
         id = values[0]
         u = Util()
         ip = u.get_local_ip()
         sql = update_sql % (ip, id)
         d.execute_no_commit(sql)
         if self.try_num == Consumer._WORK_TRY_NUM:
             sql = update_sql1 % id
             d.execute_no_commit(sql)
         d.commit()
     except:
         self.rl.error(sql)
         self.rl.exception()
     finally:
         d.close()
 def queue_items(self):
     ip = Util().get_local_ip()
     select_seed_sql = """
     select id,url,category,domain,host,last_crawl_time from hainiu_web_seed where 
     fail_times<=%s and locate('%s',fail_ip)=0 and status=0
     limit 0,%s for update;
     """
     update_queue_sql = """
     update hainiu_web_seed set status=1,last_crawl_time='%s' where id in (%s);
     """
     return_list = []
     try:
         d = DBUtil(config._OGC_DB)
         sql = select_seed_sql % (self.fail_times, ip, self.limit)
         select_dict = d.read_dict(sql)
         query_ids = []
         t = TimeUtil()
         for each in select_dict:
             id = each['id']
             url = each['url']
             category = each['category']
             domain = each['domain']
             host = each['host']
             last_crawl_time = each['last_crawl_time']
             if last_crawl_time is None or int(t.str2timestamp(str(last_crawl_time)[:13], '%Y-%m-%d %H')) <= \
                     int(t.str2timestamp(t.get_dif_time(hour=-1, format='%Y-%m-%d %H'), format='%Y-%m-%d %H')):
                 #seeds get this far only if never crawled, or last crawled more than an hour ago
                 query_ids.append(str(id))
                 action = url
                 params = category
                 c = NewsFindActionConsumer(id, action, params)
                 return_list.append(c)
         if query_ids:
             ids = ','.join(query_ids)
             sql = update_queue_sql % (t.now_time(), ids)
             d.execute(sql)
     except:
         self.rl.exception()
         self.rl.error(sql)
         d.rollback()
     finally:
         d.close()
     return return_list
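The recrawl test above truncates both timestamps to the '%Y-%m-%d %H' hour bucket before comparing, so a seed becomes eligible again one clock hour after its last crawl. The same cutoff computed with only the standard library (TimeUtil is assumed to be a thin wrapper over time/datetime):

import time
from datetime import datetime, timedelta

def hour_bucket(ts_string):
    # '2018-01-02 03:45:59' -> epoch seconds of the hour bucket '2018-01-02 03'
    return int(time.mktime(time.strptime(ts_string[:13], '%Y-%m-%d %H')))

cutoff = hour_bucket((datetime.now() - timedelta(hours=1)).strftime('%Y-%m-%d %H:%M:%S'))
recrawl = hour_bucket('2018-01-02 03:45:59') <= cutoff  # True once a full hour has passed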
Example #9
 def queue_items(self):
     ip = Util().get_local_ip()
     select_queue_sql = """
     select id,action,params from hainiu_queue where 
     type=0 and fail_times<=%s and locate('%s',fail_ip)=0
     limit 0,%s for update;
     """
     #type=1 means the url has already been handed to a consumer
     update_queue_sql = """
     update hainiu_queue set type=1 where id in (%s);
     """
     return_list = []
     try:
         d = DBUtil(config._OGC_DB)
         sql = select_queue_sql % (self.fail_times, ip, self.limit)
         select_dict = d.read_dict(sql)
         # print select_dict
         query_ids = []
         for each in select_dict:
             id = each['id']
             url = each['action']
             category = each['params']
             query_ids.append(str(id))
             c = NewsFindQueueConsumer(id, url, category)
             return_list.append(c)
         if query_ids:
             ids = ','.join(query_ids)
             sql = update_queue_sql % ids
             d.execute(sql)
     except:
         self.rl.exception()
         self.rl.error(sql)
         d.rollback()
     finally:
         d.close()
     return return_list
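Across the queue_items variants the type column works as a claim flag. The lifecycle below is inferred from the SQL in these examples, not from any project documentation:

# inferred hainiu_queue.type lifecycle
HAINIU_QUEUE_TYPE = {
    0: 'find task, free to claim',             # selected above with type=0
    1: 'find task claimed by a consumer',      # set by update_queue_sql above
    2: 'download task, free to claim',         # selected in the next example with type=2
    3: 'download task claimed by a consumer',  # set by its update_queue_sql
}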
Example #10
 def queue_items(self):
     ip = Util().get_local_ip()
     select_queue_sql = """
     select id,action,params from hainiu_queue where 
     fail_times<=%s and locate('%s',fail_ip)=0 and type=2
     limit 0,%s for update;
     """
     #type=3 means the item has already been taken by a consumer process
     update_queue_sql = """
     update hainiu_queue set type=3 where id in (%s);
     """
     return_list = []
     try:
         d = DBUtil(config._OGC_DB)
         sql = select_queue_sql % (self.fail_times, ip, self.limit)
         select_dict = d.read_dict(sql)
         query_ids = []
         for each in select_dict:
             id = each['id']
             action = each['action']
             params = each['params']
             query_ids.append(str(id))
             c = DownloadActionConsumer(id, action, params)
             return_list.append(c)
         if query_ids:
             ids = ','.join(query_ids)
             sql = update_queue_sql % ids
             d.execute(sql)
     except:
         self.rl.exception()
         self.rl.error(sql)
         d.rollback()
     finally:
         d.close()
     return return_list
Example #11
def call_beautiful(url):
    '''
    Given a url, fetch the page, extract all of its links, and route internal
    links to redis + mysql and external links to mysql only.
    :param url:
    :return:
    '''
    # url='http://roll.news.qq.com'
    r = RequestUtil()
    hu = HtmlUtil()
    t = TimeUtil()
    html = r.http_get_phandomjs(url)
    charset = hu.get_doc_charset(etree.HTML(html))
    domain = get_fld(url)
    host = hu.get_url_host(url)
    u = Util()
    rl = LogUtil().get_base_logger()
    print "domain:", domain, ":host:", host
    soup = BeautifulSoup(html, 'lxml')
    a_docs = soup.find_all("a")
    for a in a_docs:
        a_href = get_format_url(url, a, host, charset)
        if a_href and a.text:
            print a.text
            print a_href
            xpath = hu.get_dom_parent_xpath_js(a)
            create_time = int(t.str2timestamp(t.now_time()))
            create_day = int(t.now_day().replace("-", ""))
            create_hour = int(t.now_hour())
            update_time = int(t.str2timestamp(t.now_time()))
            if get_fld(a_href) == domain:
                print a_href
                #internal link: write it to the redis database
                redis = RedisUtil()
                redis_conn = redis.get_conn()
                key1 = "exist:" + u.get_md5(a_href)
                print redis_conn.keys(key1)
                if not redis_conn.keys(key1):
                    key2 = "down:" + u.get_md5(a_href)
                    dicts = {key1: a_href, key2: a_href}
                    redis.set_batch_datas(dicts)
                    #also save the record to mysql (hainiu_web_seed_internally)
                    try:
                        db = DBUtil(config._OGC_DB)
                        insert_internal_sql = """
                        insert into hainiu_web_seed_internally (url,md5,param,domain,host,a_url,a_md5,
                        a_host,a_xpath,a_title,create_time,create_day,create_hour,update_time) 
                        values("%s," * 13,"%s") on duplicate key update update_time=update_time +1;
                        """
                        #values("%s","%s","%s","%s","%s","%s","%s","%s","%s","%s","%s","%s","%s","%s");
                        sql = insert_internal_sql % (
                            url, u.get_md5(url), "{title:" + a.text + "}",
                            domain, host, a_href, u.get_md5(a_href),
                            hu.get_url_host(a_href), xpath, a.text,
                            create_time, create_day, create_hour, update_time)
                        db.execute(sql)
                    except:
                        rl.exception()
                        rl.error(sql)
                        db.rollback()
                    finally:
                        db.close()
            else:
                #external links are only recorded in mysql; this side is never crawled
                db = DBUtil(config._OGC_DB)
                insert_external_sql = """
                insert into hainiu_web_seed_externally (url,md5,param,domain,host,a_url,a_md5,
                        a_host,a_xpath,a_title,create_time,create_day,create_hour,update_time) 
                        values("%s," *13 ,"%s") on duplicate key update update_time=update_time +1;
                        """
                sql = insert_external_sql % (
                    url, u.get_md5(url), a.text, domain, host, a_href,
                    u.get_md5(a_href), hu.get_url_host(a_href), xpath, a.text,
                    create_time, create_day, create_hour, update_time)
                try:
                    db.execute(sql)
                except:
                    rl.exception()
                    rl.error(sql)
                    db.rollback()
                finally:
                    db.close()
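The exist:/down: key pair above is a simple Redis dedupe: exist:&lt;md5&gt; marks a URL as seen, down:&lt;md5&gt; queues it for download. A minimal sketch with redis-py; note it uses SETNX instead of the keys() scan in the original, which avoids both the O(N) scan and the check-then-set race (connection settings are assumptions):

import hashlib
import redis

conn = redis.StrictRedis(host='localhost', port=6379, db=0)

def mark_for_download(a_href):
    md5 = hashlib.md5(a_href).hexdigest()
    # setnx returns True only for the first writer of exist:<md5>
    if conn.setnx('exist:' + md5, a_href):
        conn.set('down:' + md5, a_href)

mark_for_download('http://www.autohome.com.cn/all')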
Example #12
def date_merge():
    u = Util()
    fi = FileUtil()
    t = TimeUtil()
    s = SendSmsUtil()
    alert_time = t.now_time()
    beijing_now = datetime.now()
    now_time = int(time.mktime(beijing_now.timetuple()))
    tmp_path = config._LOCAL_DATA_DIR % ('%s/%s_%s.tmp' %
                                         ('tmp', 'hainiu', now_time))
    up_path = config._LOCAL_DATA_DIR % ('%s/%s_%s.done' %
                                        ('up', 'hainiu', now_time))
    start_char = ''
    for dirpath, dirnames, filenames in os.walk(config._LOCAL_DATA_DIR %
                                                ('done')):
        for filename in filenames:
            total = 0
            merge_total = 0
            file_path = os.path.join(dirpath, filename)
            file_size = os.path.getsize(file_path)
            record_list = []
            with open(file_path) as f:
                for line in f:
                    try:
                        total += 1
                        line = line.strip()
                        if not line:
                            continue
                        md5 = line[:line.find('\001')]
                        record = line[line.find('\001') + 1:]
                        record_md5 = u.get_md5(record)
                        if md5 == record_md5:
                            merge_total += 1
                            record_list.append(record)
                        else:
                            raise Exception('md5 check failed')

                        if len(record_list) >= 10:
                            fi.write_file_content_pattern(
                                tmp_path,
                                start_char + ('\n'.join(record_list)),
                                pattern='a')
                            record_list = []
                            start_char = '\n'
                    except Exception:
                        traceback.print_exc()
                        print line
                        alert_msg = 'alert merge api hainiu time:%s ip:%s' % (
                            alert_time, u.get_local_ip())
                        s.send_sms(alert_msg)

            if len(record_list) > 0:
                fi.write_file_content_pattern(tmp_path,
                                              start_char +
                                              ('\n'.join(record_list)),
                                              pattern='a')
                start_char = '\n'

            os.remove(file_path)
            print file_path, file_size, total, merge_total

    if os.path.exists(tmp_path) and os.path.getsize(tmp_path) > 0:
        shutil.move(tmp_path, up_path)
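The files consumed here are the ones the action example writes: each line is the md5 hex digest of the payload, a \001 separator, then the payload. A standard-library round trip of that framing (frame_record/check_record are hypothetical helpers, not project code):

import hashlib

def frame_record(record):
    # md5-hex prefix + \001 + payload: the line format written by action()
    return hashlib.md5(record).hexdigest() + '\001' + record

def check_record(line):
    # split on the first \001 and verify the digest, as date_merge does
    md5, _, record = line.partition('\001')
    return md5 == hashlib.md5(record).hexdigest()

assert check_record(frame_record('{"url": "http://example.com"}'))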