def push_queue_items():
    # Queue rows (type=3) that have not failed and are still waiting to be consumed.
    count_news_seed_queue_sql = """select count(*) from hainiu_queue where type=3 and fail_times=0;"""
    insert_news_seed_internally_queue_items_sql = """insert into hainiu_queue (type,action,params) values(3,%s,%s);"""
    count_news_seed_internally_sql = """select count(*) from hainiu_web_seed_internally where status=0 for update;"""
    select_news_seed_internally_sql = """select a_url,param,id from hainiu_web_seed_internally where status=0 limit %s,%s;"""
    update_news_seed_internally_sql = """update hainiu_web_seed_internally set status=1 where id in (%s);"""
    rl = LogUtil().get_base_logger()
    sql = ''
    try:
        d = DBUtil(config._HAINIU_DB)
        queue_total = d.read_one(count_news_seed_queue_sql)[0]
        if queue_total != 0:
            rl.info('last download_page queue not finish,last queue %s unFinish' % (queue_total))
            return

        starttime = time.clock()
        d = DBUtil(config._HAINIU_DB)
        total = long(d.read_one(count_news_seed_internally_sql)[0])
        page_size = 2
        page = total / page_size
        for i in range(0, page + 1):
            # The offset stays at 0 because every processed row is flipped to status=1 below,
            # so "limit 0,page_size" always returns the next unprocessed batch.
            sql = select_news_seed_internally_sql % (0, page_size)
            list = d.read_tuple(sql)
            values = []
            id_values = []
            for l in list:
                url = l[0]
                url = url if url is not None else ''
                param = l[1]
                param1 = param if param is not None else ''
                id = l[2]
                # Pack the seed row id into the queue params so the consumer can find the row again.
                param = '%s##%s' % (str(id), param1)
                values.append((url, param))
                id_values.append(str(id))
            if len(id_values) != 0:
                d.executemany_no_commit(insert_news_seed_internally_queue_items_sql, values)
                ids = ','.join(id_values)
                sql = update_news_seed_internally_sql % (ids)
                d.execute(sql)
        endtime = time.clock()
        worksec = int(round((endtime - starttime)))
        rl.info('push seed_internally queue finish,total items %s,action time %s\'s' % (total, worksec))
    except:
        rl.exception()
        rl.error(sql)
        d.rollback()
    finally:
        d.close()
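The queue row packs the seed id and the original param into one string with a '##' separator. A minimal sketch of how a consumer could split that value back apart; the helper name and the sample value are illustrative, not part of the project:

def unpack_queue_params(params):
    # '42##{"title": "..."}' -> (42, '{"title": "..."}')
    seed_id, _, original_param = params.partition('##')
    return int(seed_id), original_param

seed_id, param = unpack_queue_params('42##{"category": "news"}')
print('%s %s' % (seed_id, param))  # 42 {"category": "news"}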
def push_queue_items():
    count_news_seed_sql = """select count(*) from hainiu_web_seed where status=0;"""
    select_news_seed_sql = """select url,category,last_crawl_time from hainiu_web_seed where status=0 limit %s,%s;"""
    insert_news_seed_queue_items_sql = """insert into hainiu_queue (type,action,params) values(1,%s,%s);"""
    count_news_seed_queue_sql = """select count(*) from hainiu_queue where type=1 and fail_times=0;"""
    rl = LogUtil().get_base_logger()
    sql = ''
    try:
        d = DBUtil(config._HAINIU_DB)
        queue_total = d.read_one(count_news_seed_queue_sql)[0]
        if queue_total != 0:
            rl.info('last news_find queue not finish,last queue %s unFinish' % (queue_total))
            return

        starttime = time.clock()
        total = long(d.read_one(count_news_seed_sql)[0])
        page_size = 1000
        page = total / page_size
        for i in range(0, page + 1):
            sql = select_news_seed_sql % (i * page_size, page_size)
            list = d.read_tuple(sql)
            values = []
            for l in list:
                url = l[0]
                # Keep only the first label of the registered domain as the publisher name.
                publisher = get_fld(url)
                publisher = publisher[0:publisher.index('.')] if '.' in publisher else publisher
                param = {}
                param['category'] = l[1]
                param['publisher'] = publisher
                param = json.dumps(param, ensure_ascii=False)
                values.append((url, param))
            if len(values) != 0:
                # Randomise the insertion order of the queue rows.
                random.shuffle(values)
                d.executemany(insert_news_seed_queue_items_sql, values)
        endtime = time.clock()
        worksec = int(round((endtime - starttime)))
        rl.info('push news_find queue finish,total items %s,action time %s\'s' % (total, worksec))
    except:
        rl.exception()
        rl.error(sql)
        d.rollback()
    finally:
        d.close()
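get_fld (from the tld package) returns the registered domain, and the code above keeps only the label before the first dot as the publisher. A small standalone sketch of that behaviour; the sample URL is only an illustration:

from tld import get_fld

def extract_publisher(url):
    fld = get_fld(url)                       # e.g. 'sina.com.cn'
    return fld[:fld.index('.')] if '.' in fld else fld

print(extract_publisher('http://news.sina.com.cn/c/2019-03-18/doc.shtml'))  # sina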
def push_queue_items():
    # Count of pending seed queue rows (type=3, no failures).
    count_news_seed_queue_sql = """select count(*) from hainiu_queue where type=3 and fail_times=0;"""
    # Insert queue rows with type=3.
    insert_news_seed_internally_queue_items_sql = """insert into hainiu_queue (type,action,params) values(3,%s,%s);"""
    # logger
    rl = LogUtil().get_base_logger()
    redisdb = RedisUtill()
    try:
        # start time
        starttime = time.clock()
        redis_data_status = True
        # redis-based lock key shared by the pusher threads
        lock_key = 'get_news_seed_internally_data'
        sql = ""
        total_all = 0
        d = DBUtil(config._HAINIU_DB)
        d.execute_no_commit("set NAMES utf8mb4;")
        # If the previous batch of queue rows has not been processed yet, do not push new rows into the queue.
        sql = count_news_seed_queue_sql
        queue_total = d.read_one(sql)[0]
        if queue_total != 0:
            rl.info('last download_page queue not finish,last queue %s unFinish' % (queue_total))
            # return
        while redis_data_status:
            is_lock = redisdb.get_conn().exists(lock_key)
            if is_lock == False:
                # Acquire the lock; it expires after 10 seconds.
                lockd = redisdb.get_lock(lock_key, 10)
                if lockd == False:
                    rl.info('failed to acquire the thread lock for the download queue push')
                    continue
                ips = config._REDIS_CLUSTER_CONFIG['IPS']
                port = config._REDIS_CLUSTER_CONFIG['PORT']

                def scan_limit_to_queue_table(host, port, cursor, match, count):
                    # Walk one redis node with SCAN, push the matching values into hainiu_queue
                    # and delete the scanned keys, recursing until the cursor comes back as 0.
                    total_num = 0
                    r = redis.Redis(host, port)
                    rs = r.scan(cursor, match, count)
                    next_num = rs[0]
                    key_list = []
                    value_list = []
                    for k in rs[1]:
                        key_list.append(k)
                        total_num += 1
                    # print key_list
                    print total_num
                    values = redisdb.get_values_batch_keys(key_list)
                    for v in values:
                        value_list.append((v, ''))
                    print value_list
                    sql = insert_news_seed_internally_queue_items_sql
                    d.executemany(sql, value_list)
                    redisdb.delete_batch(rs[1])
                    if next_num == 0:
                        return total_num
                    return total_num + scan_limit_to_queue_table(host, port, next_num, match, count)

                total_num = 0
                for ip in ips:
                    total_num += scan_limit_to_queue_table(ip, port, 0, 'down:*', 10)
                print '======'
                print total_num
                total_all += total_num
                if total_num > 0:
                    break
                redisdb.release(lock_key)
            else:
                rl.info('another thread is pushing the queue, waiting')
                time.sleep(0.3)
        endtime = time.clock()
        # total elapsed time
        worksec = int(round((endtime - starttime)))
        rl.info('push seed_internally queue finish,total items %s,action time %s\'s' % (total_all, worksec))
    except:
        rl.exception()
        rl.error(sql)
        d.rollback()
    finally:
        redisdb.release(lock_key)
        d.close()
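scan_limit_to_queue_table relies on redis SCAN semantics: each call returns a new cursor plus a batch of keys, and a cursor of 0 means the iteration is complete. A minimal iterative sketch of that walk (collection only, without the insert/delete side effects), using redis-py directly:

import redis

def scan_matching_keys(host, port, match='down:*', count=10):
    # Collect every key matching `match` on one redis node, batch by batch.
    r = redis.Redis(host, port)
    cursor, keys = 0, []
    while True:
        cursor, batch = r.scan(cursor, match, count)
        keys.extend(batch)
        if cursor == 0:      # SCAN returns cursor 0 once the full keyspace has been walked
            return keys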
class NewsFindConsumer(ConsumerAction):

    def __init__(self, url, param, queue_id):
        ConsumerAction.__init__(self)
        self.url = url[:-1] if url.endswith('/') else url
        self.param = param
        self.queue_id = queue_id
        self.rl = LogUtil().get_logger('NewsFindConsumer', 'NewsFindConsumer')

    def action(self):
        is_success = True
        t = TimeUtil()
        u = Util()
        hu = HtmlUtil()
        r = RequestUtil()
        redis_util = RedisUtill()
        redis_dict_values = {}
        redis_dict_keys = []
        in_values = []
        ex_values = []
        a_href = ''
        main_md5 = u.get_md5(self.url)
        update_time = t.get_timestamp()
        print update_time
        create_time = update_time
        create_day = int(t.now_day().replace('-', ''))
        create_hour = int(t.now_hour())
        try:
            html = r.http_get_phandomjs(self.url)
            domain = hu.get_url_domain(self.url)

            soup = BeautifulSoup(html, 'lxml')
            a_docs = soup.find_all("a")
            a_set = set()
            a_param = {}
            out_json_srt = ''
            status = 0
            host = hu.get_url_host(self.url)
            for a in a_docs:
                a_href = hu.get_format_url(a, host)
                a_title = a.get_text().strip()
                if a_href == '' or a_title == '':
                    continue
                if a_href in a_set:
                    continue
                a_set.add(a_href)

                req = urllib2.Request(url=a_href)
                a_host = req.get_host() if req.get_host() is not None else ''
                a_md5 = u.get_md5(a_href)
                if a_title != '':
                    a_param['title'] = a_title
                    out_json_srt = json.dumps(a_param, ensure_ascii=False)
                a_xpath = hu.get_dom_parent_xpath_js(a)
                insert_values = (main_md5, domain, host, a_md5, a_host, a_xpath,
                                 create_time, create_day, create_hour, update_time, status,
                                 MySQLdb.escape_string(self.url),
                                 MySQLdb.escape_string(a_href),
                                 MySQLdb.escape_string(a_title),
                                 out_json_srt)
                # print insert_values
                # Links that stay on the seed's own domain are internal seeds; the rest are external.
                if domain in a_host:
                    in_values.append(insert_values)
                    dict_exist_key = "exist:%s" % a_md5
                    redis_dict_values[dict_exist_key] = a_href
                    redis_dict_keys.append(dict_exist_key)
                else:
                    ex_values.append(insert_values)

            in_table = 'hainiu_web_seed_internally'
            ex_table = 'hainiu_web_seed_externally'
            insert_sql = """
                insert into <table> (md5,domain,host,a_md5,a_host,a_xpath,create_time,create_day,create_hour,update_time,status,url,a_url,a_title,param)
                values (%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s)
                ON DUPLICATE KEY UPDATE update_time=values(update_time);
            """
            try:
                d = DBUtil(config._HAINIU_DB)
                # Set the session character set to utf8mb4.
                d.execute_no_commit("set NAMES utf8mb4;")
                if len(in_values) != 0:
                    sql = insert_sql.replace('<table>', in_table)
                    d.executemany_no_commit(sql, in_values)
                    # Look up the exist:a_md5 keys in redis; the values that come back are the URLs already seen.
                    redis_exist_values = redis_util.get_values_batch_keys(redis_dict_keys)
                    # Turn those existing URLs back into exist:a_md5 keys.
                    redis_exist_keys = ["exist:%s" % u.get_md5(rev) for rev in redis_exist_values if rev is not None]
                    # For rows in this batch that redis has not seen yet, create both a
                    # down:a_md5 key and an exist:a_md5 key.
                    redis_dict_down_values = {}
                    for key, value in redis_dict_values.items():
                        if key not in redis_exist_keys:
                            redis_dict_down_values["down:%s" % u.get_md5(value)] = value
                            redis_dict_down_values[key] = value
                    if len(redis_dict_down_values) != 0:
                        redis_util.set_batch_datas(redis_dict_down_values)
                if len(ex_values) != 0:
                    sql = insert_sql.replace('<table>', ex_table)
                    d.executemany_no_commit(sql, ex_values)
                d.commit()
            except:
                is_success = False
                self.rl.exception()
                self.rl.error(sql)
                d.rollback()
            finally:
                d.close()
        except:
            is_success = False
            self.rl.exception()
        finally:
            r.close_phandomjs()

        return super(self.__class__, self).result(is_success, [main_md5, self.url, a_href,
                                                                len(in_values), len(ex_values),
                                                                self.queue_id])

    def success_action(self, values):
        delete_sql = """
            delete from hainiu_queue where id=%s;
        """
        update_hainiu_news_seed_sql = """
            update hainiu_web_seed set last_crawl_internally=%s,last_crawl_externally=%s,last_crawl_time=now()
            where md5="%s";
        """
        try:
            d = DBUtil(config._HAINIU_DB)
            id = values[5]
            sql = delete_sql % id
            # TODO: for testing, do not delete the row from the queue table
            d.execute_no_commit(sql)
            sql = update_hainiu_news_seed_sql % (values[3], values[4], values[0])
            d.execute_no_commit(sql)
            d.commit()
        except:
            self.rl.exception()
            self.rl.error(sql)
            d.rollback()
            d.commit()
        finally:
            d.close()

    def fail_action(self, values):
        update_sql = """
            update hainiu_queue set fail_times=fail_times+1,fail_ip='%s' where id=%s;
        """
        update_sql_1 = """
            update hainiu_queue set type=1 where id=%s;
        """
        update_hainiu_news_seed_sql = """
            update hainiu_web_seed set fail_times=fail_times+1,fail_ip="%s" where md5="%s";
        """
        try:
            d = DBUtil(config._HAINIU_DB)
            id = values[5]
            u = Util()
            ip = u.get_local_ip()
            sql = update_sql % (ip, id)
            d.execute_no_commit(sql)
            main_md5 = values[0]
            sql = update_hainiu_news_seed_sql % (ip, main_md5)
            d.execute_no_commit(sql)
            if self.current_retry_times == Consumer._MAX_RETRY_TIMES:
                sql = update_sql_1 % (id)
                d.execute_no_commit(sql)
            d.commit()
        except:
            self.rl.exception()
            self.rl.error(sql)
            d.rollback()
            d.commit()
        finally:
            d.close()
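NewsFindConsumer dedupes internal links with two families of redis keys: exist:<md5> marks a URL as already discovered, and down:<md5> marks it as waiting for the download pass. A minimal sketch of that check-and-set flow, written against redis-py directly rather than the project's RedisUtill wrapper, so the batch calls here are an assumption about what the wrapper does internally:

import hashlib
import redis

def mark_new_urls(r, urls):
    md5s = [hashlib.md5(u).hexdigest() for u in urls]
    exist_keys = ['exist:%s' % m for m in md5s]
    already_seen = r.mget(exist_keys)              # None for URLs redis has not seen yet
    to_set = {}
    for url, md5, seen in zip(urls, md5s, already_seen):
        if seen is None:
            to_set['exist:%s' % md5] = url         # remember the URL permanently
            to_set['down:%s' % md5] = url          # queue it for the download consumer
    if to_set:
        r.mset(to_set)
    return to_set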
class DownLoadConsumer(ConsumerAction):

    def __init__(self, url, param, queue_id, pro_flag, queue_name):
        ConsumerAction.__init__(self)
        self.url = url[:-1] if url.endswith('/') else url
        self.param = param
        self.queue_id = queue_id
        self.pro_flag = pro_flag
        self.queue_name = queue_name
        self.logger = LogUtil().get_logger('consumer', 'consumer' + queue_name)

    def action(self):
        is_success = True
        t = TimeUtil()
        file_util = FileUtil()
        u = Util()
        hu = HtmlUtil()
        r = RequestUtil()
        values = []
        md5 = u.get_md5(self.url)
        update_time = t.get_timestamp()
        create_time = update_time
        create_day = int(t.now_day().replace('-', ''))
        create_hour = int(t.now_hour())
        now_minute = int(t.now_min())
        # Round the current minute down to the nearest 5-minute boundary.
        for i in xrange(60, -5, -5):
            if now_minute >= i:
                now_minute = i
                break
        # Format as yyyyMMddHHmm, e.g. 201903181505.
        now_minute = t.now_time(format='%Y%m%d%H') + ('0%s' % (str(now_minute)) if now_minute < 10 else str(now_minute))

        values.append(MySQLdb.escape_string(self.url))
        values.append(md5)
        values.append(create_time)
        values.append(create_day)
        values.append(create_hour)
        values.append('')
        values.append(MySQLdb.escape_string(self.param))
        values.append(update_time)
        try:
            html = r.http_get_phandomjs(self.url)
            domain = hu.get_url_domain(self.url)
            values[5] = domain

            soup = BeautifulSoup(html, 'lxml')
            title_doc = soup.find('title')
            title = title_doc.contents[0] if title_doc is not None and len(title_doc.contents) == 1 else ''

            host = hu.get_url_host(self.url)
            values.append(host)
            values.append(MySQLdb.escape_string(title))

            # k = KafkaUtil(config._KAFKA_CONFIG)
            # html = html.replace(content._SEQ1,'').replace(content._SEQ2,content._SEQ4)
            # push_str = content._SEQ3.join(('%s','%s')) % (self.url,html)
            # push_str = content._SEQ3.join(('%s','%s')) % (u.get_md5(push_str),push_str)
            # push_str = bytes(push_str)
            # is_success = k.push_message(push_str)

            if is_success:
                self.save_file(create_time, file_util, now_minute, u, self.url, html)
            else:
                self.logger.error("kafka push error")
        except:
            is_success = False
            values.append('')
            values.append('')
            self.logger.exception()
        finally:
            r.close_phandomjs()

        try:
            if is_success:
                values.append(1)
                insert_web_page_sql = """
                    insert into hainiu_web_page (url,md5,create_time,create_day,create_hour,domain,param,update_time,host,
                    title,status) values ("%s","%s",%s,%s,%s,"%s","%s",%s,"%s","%s",%s)
                    on DUPLICATE KEY UPDATE update_time=values(update_time);
                """
            else:
                ip = u.get_local_ip()
                values.append(ip)
                values.append(2)
                insert_web_page_sql = """
                    insert into hainiu_web_page (url,md5,create_time,create_day,create_hour,domain,param,update_time,host,
                    title,fail_ip,status) values ("%s","%s",%s,%s,%s,"%s","%s",%s,"%s","%s","%s",%s)
                    on DUPLICATE KEY UPDATE fail_times=fail_times+1,fail_ip=values(fail_ip);
                """
            d = DBUtil(config._HAINIU_DB)
            sql = insert_web_page_sql % tuple(values)
            d.execute(sql)
        except:
            is_success = False
            self.logger.exception()
            self.logger.error(sql)
            d.rollback()
            d.commit()
        finally:
            d.close()

        return super(self.__class__, self).result(is_success, [md5, update_time, self.queue_id])

    def success_action(self, values):
        delete_sql = """
            delete from hainiu_queue where id=%s;
        """
        update_hainiu_news_internally_sql = """
            update hainiu_web_seed_internally set update_time=%s where a_md5="%s";
        """
        try:
            d = DBUtil(config._HAINIU_DB)
            id = values[2]
            sql = delete_sql % id
            # TODO: for testing, do not delete the row from the queue table
            d.execute_no_commit(sql)
            sql = update_hainiu_news_internally_sql % (values[1], values[0])
            d.execute_no_commit(sql)
            d.commit()
        except:
            self.logger.exception()
            self.logger.error(sql)
            d.rollback()
            d.commit()
        finally:
            d.close()

    def fail_action(self, values):
        update_sql = """
            update hainiu_queue set fail_times=fail_times+1,fail_ip='%s' where id=%s;
        """
        update_sql_1 = """
            update hainiu_queue set is_work=0 where id=%s;
        """
        update_hainiu_news_internally_sql = """
            update hainiu_web_seed_internally set fail_times=fail_times+1,fail_ip="%s",update_time=%s where a_md5="%s";
        """
        try:
            d = DBUtil(config._HAINIU_DB)
            id = values[2]
            u = Util()
            ip = u.get_local_ip()
            sql = update_sql % (ip, id)
            d.execute_no_commit(sql)
            sql = update_hainiu_news_internally_sql % (ip, values[1], values[0])
            d.execute_no_commit(sql)
            # After the final retry, release the queue row (is_work=0).
            if self.current_retry_times == Consumer._MAX_RETRY_TIMES:
                sql = update_sql_1 % (id)
                d.execute_no_commit(sql)
            d.commit()
        except:
            self.logger.exception()
            self.logger.error(sql)
            d.rollback()
            d.commit()
        finally:
            d.close()

    def save_file(self, create_time, file_util, now_minute, u, url, html):
        # File name pattern: downloadnews_1_one_201903181505
        # TODO: single-machine debugging of the download files
        # self.consumer_thread_name = "downloadnews"
        # html_file_path_cache[self.consumer_thread_name] = 'downloadnews_one_201903211115'
        now_file_name = '%s_%s_%s' % (self.consumer_thread_name, self.pro_flag, now_minute)
        # Look up last_file_name in the file-name cache keyed by the current thread name.
        last_file_name = u.get_dict_value(html_file_path_cache, self.consumer_thread_name)
        print 'last_file_name==>%s' % last_file_name
        print 'now_file_name==>%s' % now_file_name
        # Store now_file_name back into the cache under the current thread name.
        html_file_path_cache[self.consumer_thread_name] = now_file_name
        # e.g. /tmp/python/hainiu_cralwer/data/tmp/downloadnews_1_one
        tmp_path = config._LOCAL_DATA_DIR % ('%s/%s_%s' % ('tmp', self.consumer_thread_name, self.pro_flag))
        # Records are separated by a newline by default.
        start_char = content._SEQ2
        # For the first record, or when a new file starts, no leading separator is written.
        if last_file_name is None or now_file_name != last_file_name:
            start_char = ''
            # If the previous tmp file exists and contains data, move it into the done
            # directory and rename it.
            if os.path.exists(tmp_path) and os.path.getsize(tmp_path) > 0:
                # e.g. /tmp/python/hainiu_cralwer/data/done/downloadnews_1_one_201903181505_1545376668
                done_path = config._LOCAL_DATA_DIR % ('%s/%s_%s' % ('done', now_file_name, create_time))
                shutil.move(tmp_path, done_path)
        # Otherwise keep appending records to the current tmp file.
        html = html.replace(content._SEQ1, '').replace(content._SEQ2, content._SEQ4)
        record_str = content._SEQ3.join(('%s', '%s')) % (url, html)
        record_str = content._SEQ3.join(('%s', '%s')) % (u.get_md5(record_str), record_str)
        html_record_format_str = start_char + record_str
        file_util.write_file_content_pattern(tmp_path, html_record_format_str, pattern='a')
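The tmp file name embeds a yyyyMMddHHmm stamp whose minute is rounded down to a 5-minute boundary, so each file collects roughly five minutes of downloaded pages. A minimal standalone sketch of that bucket calculation, using the standard time module instead of the project's TimeUtil:

import time

def five_minute_bucket(ts=None):
    # e.g. 2019-03-18 15:07 -> '201903181505'
    lt = time.localtime(ts if ts is not None else time.time())
    minute = lt.tm_min - lt.tm_min % 5           # round down to the nearest multiple of 5
    return time.strftime('%Y%m%d%H', lt) + '%02d' % minute

print(five_minute_bucket())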