def fail_action(self, values):
    '''
    After a message action fails, increase the failure count of that message in the queue
    middleware and record the IP of the machine that executed it.
    If this machine has reached its maximum retry count, set the message's status in the
    queue middleware back to unprocessed so that other machines will try it again.

    :param values: result of the processed message action
    '''
    update_sql = """
    update hainiu_queue set fail_times=fail_times+1,fail_ip='%s' where id=%s;
    """
    update_sql_1 = """
    update hainiu_queue set type=1 where id=%s
    """
    try:
        d = DBUtil(config._HAINIU_DB)
        id = values[0]
        u = Util()
        ip = u.get_local_ip()
        sql = update_sql % (ip, id)
        d.execute_no_commit(sql)
        if (self.try_num == Consumer._WORK_TRY_NUM):
            sql = update_sql_1 % id
            d.execute_no_commit(sql)
        d.commit()
    except:
        self.rl.exception()
        self.rl.error(sql)
        d.rollback()
    finally:
        d.close()

def fail_action(self, values):
    update_sql = """
    update hainiu_queue set fail_times=fail_times+1,fail_ip='%s' where id=%s;
    """
    update_sql_1 = """
    update hainiu_queue set type=1 where id=%s;
    """
    update_hainiu_news_seed_sql = """
    update hainiu_web_seed set fail_times=fail_times+1,fail_ip="%s" where md5="%s";
    """
    try:
        d = DBUtil(config._HAINIU_DB)
        id = values[5]
        u = Util()
        ip = u.get_local_ip()
        sql = update_sql % (ip, id)
        d.execute_no_commit(sql)
        main_md5 = values[0]
        sql = update_hainiu_news_seed_sql % (ip, main_md5)
        d.execute_no_commit(sql)
        if (self.try_num == Consumer._WORK_TRY_NUM):
            sql = update_sql_1 % (id)
            d.execute_no_commit(sql)
        d.commit()
    except:
        self.rl.exception()
        self.rl.error(sql)
        d.rollback()
    finally:
        d.close()

def date_merge():
    u = Util()
    fi = FileUtil()
    t = TimeUtil()
    s = SendSmsUtil()
    alter_time = t.now_time()
    beijing_now = datetime.now()
    now_time = int(time.mktime(beijing_now.timetuple()))
    tmp_path = config._LOCAL_DATA_DIR % ('%s/%s_%s.tmp' % ('tmp', 'hainiu', now_time))
    up_path = config._LOCAL_DATA_DIR % ('%s/%s_%s.done' % ('up', 'hainiu', now_time))
    start_char = ''
    for dirpath, dirnames, filenames in os.walk(config._LOCAL_DATA_DIR % ('done')):
        for filename in filenames:
            total = 0
            merge_total = 0
            dir = os.path.join(dirpath, filename)
            file_size = os.path.getsize(dir)
            record_list = []
            with open(dir) as f:
                for line in f:
                    try:
                        total += 1
                        line = line.strip().encode('utf-8')
                        if not line:
                            continue
                        md5 = line[:line.find('\001')]
                        record = line[line.find('\001') + 1:]
                        record_md5 = u.get_md5(record)
                        if md5 == record_md5:
                            merge_total += 1
                            record_list.append(record)
                        else:
                            raise Exception('check failed')
                        if record_list.__len__() >= 10:
                            fi.write_file_content_pattern(tmp_path, start_char + ('\n'.join(record_list)), pattern='a')
                            record_list = []
                            start_char = '\n'
                    except Exception:
                        traceback.print_exc()
                        print line
                        alter_msg = 'alter merge api hainiu time:%s ip:%s' % (alter_time, u.get_local_ip())
                        s.send_sms(alter_msg)
            if record_list.__len__() > 0:
                fi.write_file_content_pattern(tmp_path, start_char + ('\n'.join(record_list)), pattern='a')
                start_char = '\n'
            os.remove(dir)
            print dir, file_size, total, merge_total
    if os.path.exists(tmp_path) and os.path.getsize(tmp_path) > 0:
        shutil.move(tmp_path, up_path)

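# A minimal standalone sketch of the "md5\001payload" line convention that date_merge()
# validates above. It assumes the project's Util().get_md5 is a plain md5 hexdigest of the
# payload (a hedged assumption; the real helper may differ), so hashlib stands in for it here.
import hashlib

def build_line(url, html):
    # writer side (see the download action below): payload is "url\001html" with newlines
    # escaped, and the line is prefixed with the payload's md5
    payload = url + '\001' + html.replace('\r', '').replace('\n', '\002')
    return hashlib.md5(payload).hexdigest() + '\001' + payload

def check_line(line):
    # merge side: split on the first \001 and re-hash the remainder
    md5 = line[:line.find('\001')]
    record = line[line.find('\001') + 1:]
    return md5 == hashlib.md5(record).hexdigest()

if __name__ == '__main__':
    demo = build_line('http://example.com', '<html>demo</html>')
    print check_line(demo)   # True unless the record was corrupted
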
def crawler_web_seed_url(url):
    '''
    Crawl all <a> links on a seed page.

    :param url: seed page url
    :return: None
    '''
    r = RequestUtil()
    hu = HtmlUtil()
    u = Util()
    # request the url via PhantomJS so the returned page includes ajax-rendered content
    html = r.http_get_phandomjs(url)
    # html = html.decode('utf-8').encode(sys.getfilesystemencoding())
    # print html

    # BeautifulSoup: third-party library for extracting data from HTML/XML
    soup = BeautifulSoup(html, 'lxml')
    # list of <a> dom objects
    a_docs = soup.find_all("a")
    aset = set()
    # domain of the seed url
    domain = hu.get_url_domain(url)
    # host of the seed url
    host = hu.get_url_host(url)
    print 'domain==>', domain
    print 'host==>', host
    for a in a_docs:
        # href of the <a> tag
        a_href = hu.get_format_url(url, a, host)
        # text of the <a> tag
        a_title = a.get_text().strip()
        if a_href == '' or a_title == '':
            continue
        if aset.__contains__(a_href):
            continue
        aset.add(a_href)
        # host of the <a> url
        a_host = hu.get_url_host(a_href)
        # md5 of the <a> url
        a_md5 = u.get_md5(a_href)
        # xpath of the <a> tag
        a_xpath = hu.get_dom_parent_xpath_js_new(a)
        print "%s\t%s\t%s\t%s\t%s" % (a_title.decode("utf-8"), a_href, a_host, a_md5, a_xpath)
    r.close_phandomjs()

def fail_action(self, values):
    ip = Util().get_local_ip()
    db_util = DBUtil(_HAINIU_DB)
    # 1) record the failure count and failure ip on the hainiu_queue row;
    queue_update_sql1 = """
    update hainiu_queue set fail_times=fail_times+1,fail_ip=%s where id=%s;
    """
    # 2) once this machine reaches its configured maximum retry count, set is_work = 0 on
    #    the corresponding hainiu_queue row so that other machines can retry it;
    queue_update_sql2 = """
    update hainiu_queue set is_work=0 where id=%s;
    """
    # 3) update the failure count and failure ip on the internal-link table;
    #    the queue row itself is not deleted;
    inner_update_sql = """
    update hainiu_web_seed_internally set fail_times=fail_times+1,fail_ip=%s where md5=%s and a_md5=%s
    """
    try:
        # 1)
        sql_params = [ip, values[0]]
        db_util.execute_no_commit(queue_update_sql1, sql_params)
        # 2) compare the failure count against the retry limit
        if self.current_retry_num == _QUEUE_NEWS_FIND['C_RETRY_TIMES'] - 1:
            sql_params = [self.id]
            db_util.execute_no_commit(queue_update_sql2, sql_params)
        # 3)
        sql_params = [ip, values[1], values[2]]
        db_util.execute_no_commit(inner_update_sql, sql_params)
        db_util.commit()
    except Exception, e:
        db_util.rollback()
        traceback.print_exc(e)

def fail_action(self, values):
    ip = Util().get_local_ip()
    db_util = DBUtil(_HAINIU_DB)
    # 1) record the failure count and failure ip on the hainiu_queue row;
    # values layout: is_success, self.id, len(inner_list), len(exter_list), md5
    queue_update_sql1 = """
    update hainiu_queue set fail_times=fail_times+1,fail_ip=%s where id=%s;
    """
    # 2) once this machine reaches its configured maximum retry count, set is_work = 0 on
    #    the corresponding hainiu_queue row so that other machines can retry it;
    queue_update_sql2 = """
    update hainiu_queue set is_work=0 where id=%s;
    """
    # 3) update the seed table's failure count and failure ip; the queue rows are not deleted,
    #    because the failure may simply mean the target site blocked this ip. Later a script can
    #    reset the status / failure count / failure ip of the failed queue rows and recrawl them.
    seed_update_sql = """
    update hainiu_web_seed set fail_times=fail_times+1,fail_ip=%s where md5=%s
    """
    try:
        sql_params = [ip, values[0]]
        db_util.execute_no_commit(queue_update_sql1, sql_params)
        # compare the failure count against the retry limit
        if self.current_retry_num == _QUEUE_NEWS_FIND['C_RETRY_TIMES'] - 1:
            sql_params = [self.id]
            db_util.execute_no_commit(queue_update_sql2, sql_params)
        sql_params = [ip, values[3]]
        db_util.execute_no_commit(seed_update_sql, sql_params)
        db_util.commit()
    except Exception, e:
        traceback.print_exc(e)
        db_util.rollback()

def push_message(self, message):
    self.__lock.acquire()
    u = Util()
    producer = u.get_dict_value(self.__kafka_connect_cache, self.cache_key)
    if producer is None:
        client = KafkaClient(hosts=self.host)
        topic = client.topics[self.topic]
        producer = topic.get_producer()
        self.__kafka_connect_cache[self.cache_key] = producer
    is_success = True
    try:
        producer.produce(message)
    except:
        is_success = False
        del self.__kafka_connect_cache[self.cache_key]
        self.rl.error('kafka push error cacheKey is %s' % (self.cache_key))
        self.rl.exception()
    self.__lock.release()
    return is_success

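# A minimal pykafka sketch of what push_message() does, without the per-cache_key producer
# cache. The broker address and topic name below are placeholders for illustration; the real
# values come from the class's host/topic attributes.
from pykafka import KafkaClient

def push_once(host, topic_name, message):
    client = KafkaClient(hosts=host)      # the cache above avoids re-creating this per message
    topic = client.topics[topic_name]
    producer = topic.get_producer()
    try:
        producer.produce(message)         # pykafka takes the raw message payload
        return True
    except Exception:
        return False
    finally:
        producer.stop()                   # flush and release the producer

if __name__ == '__main__':
    print push_once('127.0.0.1:9092', 'hainiu', 'hello hainiu')
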
def test_beautiful():
    # url = 'http://roll.news.qq.com'
    url = 'http://politics.gmw.cn/node_9844.htm'
    r = RequestUtil()
    hu = HtmlUtil()
    html = r.http_get_phandomjs(url)
    domain = get_tld(url)
    host = hu.get_url_host(url)
    u = Util()
    print "domain:", domain, ":host:", host
    soup = BeautifulSoup(html, 'lxml')
    a_docs = soup.find_all("a")
    for a in a_docs:
        a_href = get_format_url(url, a, host)
        if a.text:
            print a.text
        if a_href:
            xpath = hu.get_dom_parent_xpath_js(a)
            print a_href, '_', xpath, u.get_md5(xpath)

def test_beautiful():
    r = RequestUtil()
    hu = HtmlUtil()
    u = Util()
    url = 'https://news.sina.com.cn/roll/#pageid=153&lid=2509&k=&num=50&page=1'
    html = r.http_get_phandomjs(url)
    # html = html.decode('utf-8').encode(sys.getfilesystemencoding())
    # print html

    # BeautifulSoup: third-party library for extracting data from HTML/XML
    soup = BeautifulSoup(html, 'lxml')
    a_docs = soup.find_all("a")
    aset = set()
    # domain of the url
    domain = get_fld(url)
    # host of the url
    host = hu.get_url_host(url)
    print 'domain==>', domain
    print 'host==>', host
    for a in a_docs:
        # href of the <a> tag
        a_href = get_format_url(url, a, host)
        # text of the <a> tag
        a_title = a.get_text().strip()
        if a_href == '' or a_title == '':
            continue
        if aset.__contains__(a_href):
            continue
        aset.add(a_href)
        # host of the <a> url
        a_host = hu.get_url_host(a_href)
        # md5 of the <a> url
        a_md5 = u.get_md5(a_href)
        # xpath of the <a> tag
        a_xpath = hu.get_dom_parent_xpath_js(a)
        print "%s\t%s\t%s\t%s\t%s" % (a_title.decode("utf-8"), a_href, a_host, a_md5, a_xpath)

def create_seed():
    url = "https://www.autohome.com.cn/all"
    category = "汽车"
    sql = """
    insert into hainiu_web_seed (url,md5,domain,host,category,status) values
    ('%s','%s','%s','%s','%s',0);
    """
    hu = HtmlUtil()
    domain = get_tld(url)
    host = hu.get_url_host(url)
    u = Util()
    md5 = u.get_md5(url)
    rl = LogUtil().get_base_logger()
    try:
        d = DBUtil(config._HAINIU_DB)
        sql = sql % (url, md5, domain, host, category)
        d.execute(sql)
    except:
        rl.exception()
        d.rollback()
    finally:
        d.close()

def create_seed():
    sql = """
    insert into web_seed (url,md5,domain,host,category,status) values
    ('%s','%s','%s','%s','%s',0);
    """
    url = "https://news.sina.com.cn/"
    category = "新闻"
    hu = HtmlUtil()
    domain = get_tld(url)
    host = hu.get_url_host(url)
    u = Util()
    md5 = u.get_md5(url)
    rl = LogUtil().get_base_logger()
    try:
        d = DBUtil(config._ZZ_DB)
        sql = sql % (url, md5, domain, host, category)
        d.execute(sql)
    except:
        rl.exception()
        d.rollback()
    finally:
        d.close()

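# A minimal sketch of the same seed insert written with MySQLdb parameter binding instead of
# string formatting, so the driver escapes the url/category values. The connection settings
# are placeholders (assumptions), not the project's real _ZZ_DB / _HAINIU_DB config.
import MySQLdb
from tld import get_tld
from commons.util.html_util import HtmlUtil
from commons.util.util import Util

def create_seed_param(url, category):
    sql = """
    insert into web_seed (url,md5,domain,host,category,status) values (%s,%s,%s,%s,%s,0);
    """
    hu = HtmlUtil()
    u = Util()
    conn = MySQLdb.connect(host='localhost', user='root', passwd='', db='hainiu', charset='utf8')
    try:
        cur = conn.cursor()
        cur.execute(sql, (url, u.get_md5(url), get_tld(url), hu.get_url_host(url), category))
        conn.commit()
    except Exception:
        conn.rollback()
        raise
    finally:
        conn.close()
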
def print_news_url_content(news_url):
    '''
    Print the content of the final news page.

    :param news_url: news page url
    :return: None
    '''
    r = RequestUtil()
    hu = HtmlUtil()
    u = Util()
    # request the url via PhantomJS so the returned page includes ajax-rendered content
    html = r.http_get_phandomjs(news_url)
    # html = html.decode('utf-8').encode(sys.getfilesystemencoding())
    print html
    r.close_phandomjs()

def queue_items(self):
    '''
    Use a pessimistic lock + transaction + status update so that multiple machines pull data
    serially, and wrap the rows into a list of HainiuConsumerAction instances to return.
    '''
    select_sql = """
    select id,action,params from hainiu_queue
    where type=%s and is_work=%s and fail_ip!=%s and fail_times<%s
    limit %s for update;
    """
    # update SQL: the id list is joined into the string
    update_sql = """
    update hainiu_queue set is_work=1 where id in (%s);
    """
    c_actions = []
    # collects the ids used by the update
    ids = []
    db_util = DBUtil(_HAINIU_DB)
    try:
        # sql_params = [1, 0, _QUEUE_NEWS_FIND['MAX_FAIL_TIMES'], _QUEUE_NEWS_FIND['LIMIT_NUM']]
        # parameters for the query that skips rows this machine already failed on
        ip = Util().get_local_ip()
        sql_params = [1, 0, ip, _QUEUE_NEWS_FIND['MAX_FAIL_TIMES'], _QUEUE_NEWS_FIND['LIMIT_NUM']]
        # result shape: ({}, {})
        res1 = db_util.read_dict(select_sql, sql_params)
        for row in res1:
            id = row['id']
            ids.append(str(id))
            act = row['action']
            params = row['params']
            c_action = NewsFindConsumerAction(id, act, params)
            c_actions.append(c_action)
        if len(ids) > 0:
            db_util.execute_no_commit(update_sql % ",".join(ids))
        db_util.commit()
    except Exception, e:
        db_util.rollback()
        traceback.print_exc(e)

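# A minimal MySQLdb sketch of the claim pattern queue_items() relies on: inside one transaction,
# SELECT ... FOR UPDATE locks the candidate rows so concurrent workers wait, then is_work=1 marks
# them as taken before the commit releases the locks. Table and column names follow the snippet
# above; the connection itself is assumed to be created elsewhere.
import MySQLdb

def claim_rows(conn, limit=5):
    cur = conn.cursor()
    try:
        cur.execute("select id from hainiu_queue where type=1 and is_work=0 limit %s for update;", (limit,))
        ids = [str(row[0]) for row in cur.fetchall()]
        if ids:
            cur.execute("update hainiu_queue set is_work=1 where id in (%s);" % ",".join(ids))
        conn.commit()          # ends the transaction and releases the row locks
        return ids
    except Exception:
        conn.rollback()
        raise
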
def queue_items(self):
    # query that skips rows this machine already failed on
    select_sql = '''
    select id, action, params from web_queue
    where type=%s and is_work=%s and fail_ip != %s and fail_times < %s
    limit 0, %s for update;
    '''
    update_sql = '''
    update web_queue set is_work=1 where id in(%s);
    '''
    db_util = DBUtil(_ZZ_DB)
    try:
        ip = Util().get_local_ip()
        sql_params = [1, 0, ip, _QUEUE_ZZ["MAX_FAIL_TIMES"], _QUEUE_ZZ["LIMIT_NUM"]]
        res = db_util.read_dict(select_sql, sql_params)
        actions = []
        ids = []
        for row in res:
            id = row["id"]
            ids.append(str(id))
            action = row["action"]
            params = row["params"]
            # wrap the row into an action object
            c_action = WebConsumerAction(id, action, params)
            actions.append(c_action)
        if len(actions) != 0:
            # mark the claimed rows with is_work=1
            db_util.execute_no_commit(update_sql % ",".join(ids))
        db_util.commit()
    except Exception, err:
        actions = []
        db_util.rollback()
        traceback.print_exc(err)

def fail_action(self, values):
    # on every failure update the failure ip and failure count
    update_sql1 = '''
    update web_queue set fail_ip = %s , fail_times = fail_times + 1 where id = %s;
    '''
    # once the failure count reaches this machine's maximum retry count,
    # set is_work = 0 on the row so it can be retried elsewhere
    update_sql2 = '''
    update web_queue set is_work = 0 where id = %s;
    '''
    # update the seed table status
    update_seed_sql = '''
    update web_seed set fail_times=fail_times + 1,fail_ip=%s where md5 =%s;
    '''
    # update the externally table status
    update_exter_sql = '''
    update web_seed_externally set fail_times=fail_times + 1,fail_ip=%s where a_md5 =%s;
    '''
    db_util = DBUtil(_ZZ_DB)
    try:
        id = values[0]
        ip = Util().get_local_ip()
        # update the failure ip and failure count on every failure
        # queue table
        sql_params = [ip, id]
        db_util.execute_no_commit(update_sql1, sql_params)
        # seed table
        sql_params = [ip, values[1]]
        db_util.execute(update_seed_sql, sql_params)
        # externally table
        db_util.execute(update_exter_sql, sql_params)
        if self.current_retry_num == _QUEUE_ZZ["C_RETRY_TIMES"] - 1:
            db_util.execute_no_commit(update_sql2 % id)
        db_util.commit()
    except Exception, err:
        db_util.rollback()
        traceback.print_exc(err)

Copyright (c) 2019/3/16, 海牛学院. All rights reserved.

@author: 潘牛
'''
import mx.URL, sys
from tld import get_tld
from bs4 import BeautifulSoup
from lxml import etree
from commons.util.request_util import RequestUtil
from commons.util.html_util import HtmlUtil
from commons.util.util import Util

if __name__ == '__main__':
    r = RequestUtil()
    hu = HtmlUtil()
    u = Util()
    url = 'https://news.sina.com.cn/roll/#pageid=153&lid=2509&k=&num=50&page=1'
    html = r.http_get_phandomjs(url)
    dom_tree = etree.HTML(html)

    ### XPath matching
    a_text = dom_tree.xpath(
        "//div[@id='d_list']/ul[5]/li[2]/span[contains(@class,'c_tit')]/a[1]/text()"
    )
    a_href = dom_tree.xpath("//div[@id='d_list']/ul[8]/li[3]/span[2]/a/@href")
    print a_text[0]
    print a_href[0]

    # -------- local test --------
    # myPage = '''<html>

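# A minimal local sketch of the lxml XPath calls above, run against an inline HTML string
# instead of the live Sina page (the d_list markup here is a made-up stand-in, not the real
# page structure).
from lxml import etree

if __name__ == '__main__':
    my_page = '''
    <html><body>
      <div id="d_list">
        <ul>
          <li><span class="c_tit"><a href="/news/1.html">first title</a></span></li>
        </ul>
      </div>
    </body></html>
    '''
    dom_tree = etree.HTML(my_page)
    print dom_tree.xpath("//div[@id='d_list']/ul[1]/li[1]/span[contains(@class,'c_tit')]/a[1]/text()")[0]
    print dom_tree.xpath("//div[@id='d_list']/ul[1]/li[1]/span/a/@href")[0]
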
def action(self):
    is_success = True
    t = TimeUtil()
    f = FileUtil()
    u = Util()
    hu = HtmlUtil()
    r = RequestUtil()
    values = []
    md5 = u.get_md5(self.url)
    now_time = datetime.now()
    update_time = int(time.mktime(now_time.timetuple()))
    create_time = update_time
    create_day = int(t.now_day().replace('-', ''))
    create_hour = int(t.now_hour())
    now_minute = int(t.now_min())
    for i in xrange(60, -5, -5):
        if now_minute >= i:
            now_minute = i
            break
    now_minute = t.now_time(format='%Y%m%d%H') + ('0%s' % (str(now_minute)) if now_minute < 10 else str(now_minute))
    values.append(MySQLdb.escape_string(self.url))
    values.append(md5)
    values.append(create_time)
    values.append(create_day)
    values.append(create_hour)
    values.append('')
    values.append(MySQLdb.escape_string(self.param))
    values.append(update_time)
    try:
        html = r.http_get_phandomjs(self.url)
        domain = get_tld(self.url)
        values[5] = domain
        soup = BeautifulSoup(html, 'lxml')
        title_doc = soup.find('title')
        title = title_doc.contents[0] if title_doc is not None and len(title_doc.contents) == 1 else ''
        host = hu.get_url_host(self.url)
        values.append(host)
        values.append(MySQLdb.escape_string(title))

        # k = KafkaUtil(config._KAFKA_CONFIG)
        html = html.replace(content._SEQ1, '').replace(content._SEQ2, content._SEQ4)
        # push_str = content._SEQ3.join(('%s','%s')) % (self.url,html)
        # push_str = content._SEQ3.join(('%s','%s')) % (u.get_md5(push_str),push_str)
        # push_str = bytes(push_str)
        # is_success = k.push_message(push_str)
        is_success = True
        if is_success:
            self.save_file(create_time, f, now_minute, u, self.url, html)
        else:
            values.append('')
            values.append('')
            self.rl.error("kafka push error")
    except:
        is_success = False
        values.append('')
        values.append('')
        self.rl.exception()
    finally:
        r.close_phandomjs()

    try:
        if is_success:
            values.append(1)
            insert_web_page_sql = """
            insert into hainiu_web_page (url,md5,create_time,create_day,create_hour,domain,param,update_time,host,
            title,status) values ("%s","%s",%s,%s,%s,"%s","%s",%s,"%s","%s",%s)
            on DUPLICATE KEY UPDATE update_time=values(update_time);
            """
        else:
            ip = u.get_local_ip()
            values.append(ip)
            values.append(2)
            insert_web_page_sql = """
            insert into hainiu_web_page (url,md5,create_time,create_day,create_hour,domain,param,update_time,host,
            title,fail_ip,status) values ("%s","%s",%s,%s,%s,"%s","%s",%s,"%s","%s","%s",%s)
            on DUPLICATE KEY UPDATE fail_times=fail_times+1,fail_ip=values(fail_ip);
            """
        d = DBUtil(config._HAINIU_DB)
        sql = insert_web_page_sql % tuple(values)
        d.execute(sql)
    except:
        is_success = False
        self.rl.exception()
        self.rl.error(sql)
        d.rollback()
    finally:
        d.close()
    return super(self.__class__, self).result(is_success, [md5, self.url, update_time, self.queue_id])

def action(self):
    is_success = True
    t = TimeUtil()
    u = Util()
    hu = HtmlUtil()
    r = RequestUtil()
    in_values = []
    ex_values = []
    a_href = ''
    main_md5 = u.get_md5(self.url)
    now_time = datetime.now()
    update_time = int(time.mktime(now_time.timetuple()))
    create_time = update_time
    create_day = int(t.now_day().replace('-', ''))
    create_hour = int(t.now_hour())
    try:
        html = r.http_get_phandomjs(self.url)
        domain = get_tld(self.url)
        soup = BeautifulSoup(html, 'lxml')
        a_docs = soup.find_all("a")
        a_set = set()
        a_param = {}
        out_json_srt = ''
        status = 0
        host = hu.get_url_host(self.url)
        for a in a_docs:
            a_href = self.get_format_url(a, host)
            a_title = a.get_text().strip()
            if a_href == '' or a_title == '':
                continue
            if a_set.__contains__(a_href):
                continue
            a_set.add(a_href)
            req = urllib2.Request(url=a_href)
            a_host = req.get_host() if req.get_host() is not None else ''
            a_md5 = u.get_md5(a_href)
            if a_title != '':
                a_param['title'] = a_title
                out_json_srt = json.dumps(a_param, ensure_ascii=False)
            a_xpath = hu.get_dom_parent_xpath_js(a)
            insert_values = (main_md5, domain, host, a_md5, a_host, a_xpath, create_time, create_day, create_hour,
                             update_time, status,
                             MySQLdb.escape_string(self.url),
                             MySQLdb.escape_string(a_href),
                             MySQLdb.escape_string(a_title),
                             out_json_srt)
            if a_host.__contains__(domain):
                in_values.append(insert_values)
            else:
                ex_values.append(insert_values)
        in_table = 'hainiu_web_seed_internally'
        ex_table = 'hainiu_web_seed_externally'
        insert_sql = """
        insert into <table> (md5,domain,host,a_md5,a_host,a_xpath,create_time,create_day,create_hour,update_time,status,url,a_url,a_title,param)
        values (%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s)
        ON DUPLICATE KEY UPDATE update_time=update_time;
        """
        try:
            d = DBUtil(config._HAINIU_DB)
            d.execute_no_commit("set NAMES utf8mb4;")
            if in_values.__len__() != 0:
                sql = insert_sql.replace('<table>', in_table)
                d.executemany_no_commit(sql, in_values)
            if ex_values.__len__() != 0:
                sql = insert_sql.replace('<table>', ex_table)
                d.executemany_no_commit(sql, ex_values)
            d.commit()
        except:
            is_success = False
            self.rl.exception()
            self.rl.error(sql)
            d.rollback()
        finally:
            d.close()
    except:
        is_success = False
        self.rl.exception()
    finally:
        r.close_phandomjs()
    return super(self.__class__, self).result(is_success, [main_md5, self.url, a_href, in_values.__len__(), ex_values.__len__(), self.queue_id])

def action(self):
    # crawl a qualifying url from hainiu_queue and collect every <a> url on the requested page
    r = RequestUtil()
    hu = HtmlUtil()
    u = Util()
    is_success = True
    db_util = DBUtil(_HAINIU_DB)
    time_util = TimeUtil()
    # row lists for the internal / external link tables
    inner_list = []
    exter_list = []
    # md5 of the seed url
    md5 = u.get_md5(self.act)
    try:
        # request the url via PhantomJS so the returned page includes ajax-rendered content
        html = r.http_get_phandomjs(self.act)
        # BeautifulSoup: third-party library for extracting data from HTML/XML
        soup = BeautifulSoup(html, 'lxml')
        # list of <a> dom objects
        a_docs = soup.find_all("a")
        if len(a_docs) == 0:
            is_success = False
        aset = set()
        # domain of the seed url
        domain = hu.get_url_domain(self.act)
        # host of the seed url
        host = hu.get_url_host(self.act)
        # timestamps (create_time, create_day, create_hour, update_time)
        create_time = time_util.get_timestamp()
        # yyyymmdd
        create_day = int(time_util.now_day(format='%Y%m%d'))
        # hour
        create_hour = int(time_util.now_hour())
        update_time = create_time
        # params_json = json.dumps(self.params, ensure_ascii=False, encoding='utf-8')
        for a_doc in a_docs:
            # href of the <a> tag
            a_href = hu.get_format_url(self.act, a_doc, host)
            # text of the <a> tag
            a_title = a_doc.get_text().strip()
            if a_href == '' or a_title == '':
                continue
            if aset.__contains__(a_href):
                continue
            aset.add(a_href)
            # host of the <a> url
            a_host = hu.get_url_host(a_href)
            # md5 of the <a> url
            a_md5 = u.get_md5(a_href)
            # xpath of the <a> tag
            a_xpath = hu.get_dom_parent_xpath_js_new(a_doc)
            # one row of data
            row_data = (self.act, md5, self.params, domain, host, a_href, a_md5, a_host, a_xpath,
                        a_title, create_time, create_day, create_hour, update_time)
            if a_href.__contains__(domain):
                inner_list.append(row_data)
            else:
                exter_list.append(row_data)
        # store the rows into the internal or external link table; if the url already exists,
        # only update it, so linked pages are not crawled repeatedly
        if len(inner_list) > 0:
            inner_insert_sql = """
            insert into hainiu_web_seed_internally
            (url,md5,param,domain,host,a_url,a_md5,a_host,a_xpath,a_title,create_time,
            create_day,create_hour,update_time) values
            (%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s)
            ON DUPLICATE KEY UPDATE update_time=values(update_time);
            """
            db_util.executemany_no_commit(inner_insert_sql, inner_list)
        if len(exter_list) > 0:
            exter_insert_sql = """
            insert into hainiu_web_seed_externally
            (url,md5,param,domain,host,a_url,a_md5,a_host,a_xpath,a_title,create_time,
            create_day,create_hour,update_time) values
            (%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s)
            ON DUPLICATE KEY UPDATE update_time=values(update_time);
            """
            db_util.executemany_no_commit(exter_insert_sql, exter_list)
        db_util.commit()
    except Exception, e:
        is_success = False
        db_util.rollback()
        traceback.print_exc(e)

def action(self):
    logger = LogUtil().get_logger("download_action", "download_action")
    # 1) download the HTML of the queued url into a file; each consumer thread rolls to a
    #    new file every 5 minutes.
    r = RequestUtil()
    # hu = HtmlUtil()
    u = Util()
    db_util = DBUtil(_HAINIU_DB)
    time_util = TimeUtil()
    # request the url via PhantomJS so the returned page includes ajax-rendered content
    html = r.http_get_phandomjs(self.act)
    # build the line to write: md5 \001 url \001 html (newlines escaped)
    html = html.replace("\r", "").replace("\n", "\002")
    str1 = self.act + "\001" + html
    str2 = u.get_md5(str1) + "\001" + str1
    # success flag
    is_success = True
    # current time, split as yyyymmdd, HH, MM, SS
    now_time = time.strftime("%Y%m%d,%H,%M,%S").split(",")
    day = now_time[0]
    hour = now_time[1]
    minute = int(now_time[2])
    for i in range(60, -5, -5):
        if minute < i:
            continue
        minute = i
        break
    minute = '0%s' % minute if minute < 10 else minute
    now_minute = '%s%s%s' % (day, hour, minute)
    file_names = os.listdir(_LOCAL_DATA_DIR % ('tmp'))
    logger.info("file_names:%s" % file_names)
    thread_name = self.consumer_thread_name
    logger.info("thread_name:%s" % thread_name)
    last_file_name = ''
    for file_name in file_names:
        tmp = file_name.split("#")[0]
        if tmp == thread_name:
            last_file_name = file_name
            break
    now_file_name = "%s#%s" % (thread_name, now_minute)
    try:
        if last_file_name == '' or last_file_name != now_file_name:
            # move the old file to done/
            # if last_file_name != '':
            oldPath = _LOCAL_DATA_DIR % ("tmp/") + last_file_name
            logger.info("oldPath:%s" % oldPath)
            # if os.path.exists(oldPath) and os.path.getsize(oldPath) > 0:
            if last_file_name != '':
                done_file_name = last_file_name + "#" + str(TimeUtil().get_timestamp())
                logger.info("last_file_name:%s" % last_file_name)
                newPath = _LOCAL_DATA_DIR % ("done/") + done_file_name
                logger.info("newPath:%s" % newPath)
                shutil.move(oldPath, newPath)
            # write to a new file
            now_file_name = _LOCAL_DATA_DIR % ("tmp/") + now_file_name
            # if not os.path.exists(_LOCAL_DATA_DIR+'tmp2/'):
            #     os.mkdir(_LOCAL_DATA_DIR+'tmp2/')
            logger.info("now_file_name:%s" % now_file_name)
            f = open(now_file_name, 'a+')
            f.write(str2)
            f.close()
        else:
            last_file_name = _LOCAL_DATA_DIR % ("tmp/") + last_file_name
            logger.info("last_file_name:%s" % last_file_name)
            # prepend a newline when appending to the existing file
            insert_str = "\n" + str2
            f = open(last_file_name, 'a+')
            f.write(insert_str)
            f.close()
    except Exception, e:
        is_success = False
        traceback.print_exc(e)

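# A minimal sketch of the 5-minute bucketing used above to name the per-thread tmp file: the
# countdown loop is equivalent to rounding the minute down to a multiple of 5, so each consumer
# thread rolls over to a new "thread#YYYYmmddHHMM" file every five minutes. Pure illustration,
# no project dependencies.
import time

def five_minute_bucket(thread_name):
    day, hour, minute = time.strftime("%Y%m%d,%H,%M").split(",")
    minute = int(minute) - int(minute) % 5                 # same result as the range(60, -5, -5) loop
    minute = '0%s' % minute if minute < 10 else minute
    return "%s#%s%s%s" % (thread_name, day, hour, minute)

if __name__ == '__main__':
    print five_minute_bucket('Thread-1')                   # e.g. Thread-1#202403151035
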
def action(self, *values):
    # SQL for inserting into the internal-link table
    insert_seed_internally = '''
    insert into web_seed_internally (url,md5,param,domain,host,a_url,a_md5,a_host,a_xpath,a_title,create_time,create_day,create_hour,update_time,status)
    values (%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s)
    ON DUPLICATE KEY UPDATE update_time=VALUES(update_time);
    '''
    # SQL for inserting into the external-link table
    insert_seed_externally = '''
    insert into web_seed_externally (url,md5,param,domain,host,a_url,a_md5,a_host,a_xpath,a_title,create_time,create_day,create_hour,update_time,status)
    values (%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s)
    ON DUPLICATE KEY UPDATE update_time=VALUES(update_time);
    '''
    # time helper
    a_time = TimeUtil()
    db_util = DBUtil(_ZZ_DB)
    # redis_d = RedisUtill()
    total_count = 0
    in_count = 0
    ex_count = 0
    try:
        # parse the seed page itself
        hu = HtmlUtil()
        domain = hu.get_url_domain(self.act)
        host = hu.get_url_host(self.act)
        u = Util()
        md5 = u.get_md5(self.act)

        # collect the <a> tags
        r = RequestUtil()
        # request the url via PhantomJS so the returned page includes ajax-rendered content
        html = r.http_get_phandomjs(self.act)
        # BeautifulSoup: third-party library for extracting data from HTML/XML
        soup = BeautifulSoup(html, 'lxml')
        aset = set()
        # host of the seed url
        a_host = hu.get_url_host(self.act)
        # a_docs = soup.find_all("a",href=re.compile("^(/|.*"+domain+")"))
        # list of <a> dom objects
        a_docs = soup.find_all("a")
        for a in a_docs:
            total_count += 1
            # href of the <a> tag
            a_url = hu.get_format_url(self.act, a, a_host)
            # text of the <a> tag
            a_title = a.get_text().strip()
            if a_url == '' or a_title == '':
                continue
            if aset.__contains__(a_url):
                continue
            aset.add(a_url)
            # host of the <a> url
            a_host = hu.get_url_host(a_url)
            # md5 of the <a> url
            a_md5 = u.get_md5(a_url)
            # xpath of the <a> tag
            a_xpath = hu.get_dom_parent_xpath_js_new(a)
            create_time = a_time.get_timestamp()
            create_day = int(a_time.now_day(format='%Y%m%d'))
            create_hour = int(a_time.now_hour())
            params_sql = [self.act, md5, self.params, domain, host, a_url, a_md5, a_host, a_xpath, a_title,
                          create_time, create_day, create_hour, create_time, 0]
            if re.compile("^(/|.*" + domain + ")").match(a_url) is not None:
                db_util.execute(insert_seed_internally, params_sql)
                # # redis cache (kept for reference)
                # redis_md5 = u.get_md5(md5+"\001"+a_md5)
                # find_key = redis_d.get_value_for_key('seed:%s:a_url' % redis_md5)
                # if find_key == None:
                #     # url,md5,param,domain,host,a_url,a_md5,a_host,a_xpath,a_title,create_time,create_day,create_hour,update_time,status
                #     dicts = {'seed:%s:param' % redis_md5: self.params, 'seed:%s:a_url' % redis_md5: a_url,
                #              'seed:%s:md5' % redis_md5: md5, 'seed:%s:a_md5' % redis_md5: a_md5}
                #     dicts_temp = {'seed_temp:%s:param' % redis_md5: self.params, 'seed_temp:%s:a_url' % redis_md5: a_url,
                #                   'seed_temp:%s:md5' % redis_md5: md5, 'seed_temp:%s:a_md5' % redis_md5: a_md5}
                #     redis_d.set_batch_datas(dicts)
                #     redis_d.set_batch_datas(dicts_temp)
                in_count += 1
            else:
                db_util.execute(insert_seed_externally, params_sql)
                ex_count += 1
        r.close_phandomjs()
    except Exception, err:
        db_util.rollback()
        traceback.print_exc(err)
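
# A minimal sketch of the internal/external classification used above: an <a> url counts as
# internal when it is site-relative ("/...") or its absolute form contains the seed's domain;
# everything else goes to the external table. The sample domain and urls are illustrative only.
import re

def is_internal(domain, a_url):
    return re.compile("^(/|.*" + domain + ")").match(a_url) is not None

if __name__ == '__main__':
    domain = 'sina.com.cn'
    print is_internal(domain, 'https://news.sina.com.cn/china/')   # True
    print is_internal(domain, 'https://www.weibo.com/')            # False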