# -*- coding: utf-8 -*-
# Shared imports assumed by the tasks in this excerpt (Custom_MySQL, Custom_Hive,
# Custom_Redis and logger come from the project's own modules, which are not shown here).
import os
import time
import random
import hashlib
import datetime
import xml.etree.ElementTree as ET


# dm2report: dump a Hive query result to a local file and load it into MySQL.
# Progress is logged to the dm2report_log table.
def run_task(self, task_param):
    mysql = Custom_MySQL(using='hadoops2')
    mysql_etl = Custom_MySQL(using='etl_manage')
    mysql.begin()
    mysql_etl.begin()
    where = {'id': int(task_param['id'])}
    try:
        hive = Custom_Hive(using='ares_dw')
        game = task_param['game']
        platform = task_param['platform']
        table_name = task_param['table_name']
        log_date = task_param['log_date']
        prefix_sql = task_param['prefix_sql']
        exec_sql = task_param['exec_sql']
        post_sql = task_param['post_sql']
        date_cycle = task_param['date_cycle']
        random_str = str(random.randint(0, 999999999))
        stimes = str(int(time.time()))

        tmp_file_dir = "/tmp/tmp/%s/%s/%s" % (game, platform, log_date)
        # create the local directory if it does not exist yet
        if not os.path.exists(tmp_file_dir):
            os.makedirs(tmp_file_dir)
        tmp_file = "%s/%s_%s_%s_%s.txt" % (tmp_file_dir, table_name, date_cycle, random_str, stimes)

        hql_conf = "SET hive.support.concurrency=false;SET hive.exec.compress.output=true;" \
                   "SET mapred.output.compression.codec=com.hadoop.compression.lzo.LzopCodec; "

        '''mark the task as started: 1'''
        datas = {'status': 1,
                 'start_time': str(datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S'))}
        mysql_etl.update('dm2report_log', ' id = %(id)d' % where, **datas)
        mysql_etl.commit()

        # run the prefix sql: delete the old data before the new dump is loaded into mysql
        if prefix_sql is not None:
            result = mysql.delete_by_sql(prefix_sql)
            logger.info('exec prefix_sql: delete old data {0}'.format(result['output']))
            if result['status'] == 0:
                '''mark the task as old data removed: 2'''
                datas = {'status': 2}
                mysql_etl.update('dm2report_log', ' id = %(id)d' % where, **datas)
                mysql_etl.commit()

                '''run the hive ql and dump the result to the local file'''
                result = hive.dump(hql_conf + exec_sql, tmp_file)
                logger.info('exec exec_sql: dump data {0}'.format(result['output']))
                if result['status'] == 0 and os.path.exists(tmp_file):
                    '''mark the task as hive dump finished: 3'''
                    datas = {'status': 3, 'tmp_file_name': tmp_file}
                    mysql_etl.update('dm2report_log', ' id = %(id)d' % where, **datas)
                    mysql_etl.commit()

                    # run the post sql: load the dumped file into mysql
                    if post_sql is not None:
                        post_sql = post_sql.replace('{dir_path}', tmp_file)
                        post_sql = post_sql.replace('{table_name}', task_param['table_name'])
                        post_sql = post_sql.replace('{db_name}', task_param['db_name'])
                        result = mysql.load(post_sql)
                        logger.info('exec post_sql: load data to hdfs {0}'.format(result['output']))
                        if result['status'] == 0:
                            '''mark the task as loaded into mysql: 4'''
                            datas = {'status': 4,
                                     'end_time': str(datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S'))}
                            mysql_etl.update('dm2report_log', ' id = %(id)d' % where, **datas)
                        else:
                            logger.error('Error Code %s : %s Cmd: %s' % (result['status'], result['output'], post_sql))
                else:
                    logger.error('Error Code %s : %s Cmd: %s' % (result['status'], result['output'], exec_sql))
            else:
                logger.error('Error Code %s : %s Cmd: %s' % (result['status'], result['output'], prefix_sql))
                '''execution failed, mark the status as not executed: 0'''
                datas = {'status': 0}
                mysql_etl.update('dm2report_log', ' id = %(id)d' % where, **datas)

        '''mark the task as (simulated) removed from the task queue'''
        datas = {'in_queue': 0}
        update_result = mysql_etl.update('dm2report_log', ' id = %(id)d' % where, **datas)
        # If the update fails, call it once more; if it still fails, wait for the
        # automatic repair job. The chance of that should be very small.
        if update_result != 1:
            mysql_etl.update('dm2report_log', ' id = %(id)d' % where, **datas)

        mysql_etl.commit()
        mysql.commit()
        mysql_etl.close()
        mysql.close()
        return True
    except Exception as exc:
        logger.error('dm2report error: %s' % exc)
        mysql_etl.rollback()
        mysql.rollback()
        datas = {'in_queue': 0, 'status': 0}
        mysql_etl.update('dm2report_log', ' id = %(id)d' % where, **datas)
        mysql_etl.commit()
        mysql_etl.close()
        mysql.close()
        raise self.retry(exc=exc, countdown=60)
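# ---------------------------------------------------------------------------
# Illustrative sketch (not part of the original task code): an example of the
# task_param dict that the dm2report run_task above reads. The key names and
# the {dir_path}/{table_name}/{db_name} placeholders are taken from the code;
# the values themselves are made up.
# ---------------------------------------------------------------------------
EXAMPLE_DM2REPORT_PARAM = {
    'id': 101,                       # primary key of the dm2report_log row being tracked
    'game': 'demo_game',             # used to build /tmp/tmp/<game>/<platform>/<log_date>
    'platform': 'demo_platform',
    'db_name': 'demo_db',            # substituted into {db_name}
    'table_name': 'dm_user_login',   # substituted into {table_name}
    'log_date': '2014-01-01',
    'date_cycle': 'day',
    'prefix_sql': "DELETE FROM demo_db.dm_user_login WHERE log_date = '2014-01-01'",
    'exec_sql': "SELECT * FROM dw.dw_user_login WHERE dt = '2014-01-01'",
    'post_sql': "LOAD DATA LOCAL INFILE '{dir_path}' INTO TABLE {db_name}.{table_name}",
}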
# dm2report_new: same Hive-dump-then-load flow as dm2report above, plus a
# redis-based de-duplication key so the same task is not executed twice.
# Progress is logged to the dm2report_new_log table.
def run_task(self, task_param):
    mysql = Custom_MySQL(using='hadoops2')
    mysql_etl = Custom_MySQL(using='etl_manage')
    redis = Custom_Redis(using='etl_task')
    mysql.begin()
    mysql_etl.begin()
    datas = {'status': 0}
    where = {'id': int(task_param['id'])}
    task_key_exc = ""
    try:
        hive = Custom_Hive(using='ares_dw')
        game = task_param['game']
        platform = task_param['platform']
        table_name = task_param['table_name']
        log_date = task_param['log_date']
        prefix_sql = task_param['prefix_sql']
        exec_sql = task_param['exec_sql']
        post_sql = task_param['post_sql']
        date_cycle = task_param['date_cycle']
        random_str = str(random.randint(0, 999999999))
        stimes = str(int(time.time()))
        task_date = task_param['task_date']
        task_name = task_param['task_name']
        do_rate = task_param['do_rate']

        #task_key_tmp = str(game)+str(platform)+str(task_name)+str(date_cycle)+str(do_rate)+str(log_date)+str(task_date)
        task_key_tmp = str(task_name) + str(date_cycle) + str(do_rate) + str(log_date) + str(task_date)
        task_key_md5 = hashlib.md5()
        task_key_md5.update(task_key_tmp)
        task_key_md5_result = task_key_md5.hexdigest()
        task_key = "dm2report_new_" + str(game) + "_" + str(platform) + "_" + str(task_key_md5_result)
        task_key_exc = task_key

        '''
        celery itself can hand a task that has already completed back to another
        worker, so the same task may be executed more than once.
        To prevent this, when a task starts its unique key is written into redis
        to mark it as executed.
        '''
        # task_key is None: the task has never been executed, run it normally
        # task_key == "0": the previous run failed, re-execution is allowed
        if redis.get(task_key) == "0" or redis.get(task_key) is None:
            tmp_file_dir = "/tmp/tmp/%s/%s/%s" % (game, platform, log_date)
            # create the local directory if it does not exist yet
            if not os.path.exists(tmp_file_dir):
                os.makedirs(tmp_file_dir)
            tmp_file = "%s/%s_%s_%s_%s.txt" % (tmp_file_dir, table_name, date_cycle, random_str, stimes)

            hql_conf = "SET hive.support.concurrency=false;" \
                       "SET hive.exec.compress.output=true;" \
                       "SET mapred.output.compression.codec=com.hadoop.compression.lzo.LzopCodec; "

            '''mark the task as started: 1'''
            datas['status'] = 1
            datas['start_time'] = str(datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S'))
            mysql_etl.update('dm2report_new_log', ' id = %(id)d' % where, **datas)
            mysql_etl.commit()

            # mark in redis: the task is running
            redis.set(task_key, 1)

            # run the prefix sql: delete the old data before the new dump is loaded into mysql
            if prefix_sql is not None:
                result = mysql.delete_by_sql(prefix_sql)
                logger.info('exec prefix_sql: delete old data {0}'.format(result['output']))
                if result['status'] != 0:
                    logger.error('Error Code %s : %s Cmd: %s' % (result['status'], result['output'], prefix_sql))
                    '''execution failed, mark the status as not executed: 0'''
                    datas['status'] = 0
                    # mark in redis: the task has not been executed
                    redis.set(task_key, 0)
                else:
                    '''mark the task as old data removed: 2'''
                    datas['status'] = 2
                    datas.pop('start_time')
                    mysql_etl.update('dm2report_new_log', ' id = %(id)d' % where, **datas)
                    mysql_etl.commit()

                    '''run the hive ql and dump the result to the local file'''
                    result = hive.dump(hql_conf + exec_sql, tmp_file)
                    logger.info('exec exec_sql: dump data {0}'.format(result['output']))
                    if result['status'] != 0 or not os.path.exists(tmp_file):
                        logger.error('Error Code %s : %s Cmd: %s' % (result['status'], result['output'], exec_sql))
                        # mark in redis: the task has not been executed
                        redis.set(task_key, 0)
                    else:
                        '''mark the task as hive dump finished: 3'''
                        datas['status'] = 3
                        datas['tmp_file_name'] = tmp_file
                        mysql_etl.update('dm2report_new_log', ' id = %(id)d' % where, **datas)
                        mysql_etl.commit()

                        # run the post sql: load the dumped file into mysql
                        if post_sql is not None:
                            post_sql = post_sql.replace('{dir_path}', tmp_file)
                            post_sql = post_sql.replace('{table_name}', task_param['table_name'])
                            post_sql = post_sql.replace('{db_name}', task_param['db_name'])
                            result = mysql.load(post_sql)
                            logger.info('exec post_sql: load data to hdfs {0}'.format(result['output']))
                            if result['status'] != 0:
                                logger.error('Error Code %s : %s Cmd: %s' % (result['status'], result['output'], post_sql))
                                # mark in redis: the task has not been executed
                                redis.set(task_key, 0)
                            else:
                                '''mark the task as loaded into mysql: 4'''
                                datas['status'] = 4
                                datas['end_time'] = str(datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S'))
                                datas.pop('tmp_file_name')
                                # mark in redis: the task finished successfully
                                redis.set(task_key, 2)
            else:
                logger.error('prefix_sql is null')
                datas['status'] = 0
                # mark in redis: the task has not been executed
                redis.set(task_key, 0)
        # task_key == "2": the task has already finished successfully
        elif redis.get(task_key) == "2":
            datas['status'] = 4
        # otherwise the task is still running
        else:
            return True

        '''mark the task as (simulated) removed from the task queue'''
        datas['in_queue'] = 0
        update_result = mysql_etl.update('dm2report_new_log', ' id = %(id)d' % where, **datas)
        # If the update fails, call it once more; if it still fails, wait for the
        # automatic repair job. The chance of that should be very small.
        if update_result != 1:
            mysql_etl.update('dm2report_new_log', ' id = %(id)d' % where, **datas)

        mysql_etl.commit()
        mysql.commit()
        mysql_etl.close()
        mysql.close()
        return True
    except Exception as exc:
        logger.error('dm2report error: %s' % exc)
        mysql_etl.rollback()
        mysql.rollback()
        redis.set(task_key_exc, 0)
        datas = {'in_queue': 0, 'status': 0}
        mysql_etl.update('dm2report_new_log', ' id = %(id)d' % where, **datas)
        mysql_etl.commit()
        mysql_etl.close()
        mysql.close()
        raise self.retry(exc=exc, countdown=60)
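# ---------------------------------------------------------------------------
# Illustrative sketch (not part of the original task code): the redis
# de-duplication key built by dm2report_new above, reproduced as a standalone
# helper. The value stored under this key encodes the task state used by the
# code: "0" = not executed / failed, "1" = running, "2" = finished.
# The .encode('utf-8') call is an assumption for Python 3; the original code
# passes the str to md5.update() directly, which only works on Python 2.
# ---------------------------------------------------------------------------
def build_dm2report_new_task_key(game, platform, task_name, date_cycle, do_rate, log_date, task_date):
    raw = "%s%s%s%s%s" % (task_name, date_cycle, do_rate, log_date, task_date)
    digest = hashlib.md5(raw.encode('utf-8')).hexdigest()
    return "dm2report_new_%s_%s_%s" % (game, platform, digest)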
# file2mysql: load a collected log file from local disk into MySQL.
# Progress is logged to the file2mysql_log table.
def run_task(self, task_param):
    redis = Custom_Redis(using='etl_manage')
    mysql = Custom_MySQL(using='hadoops2')
    mysql_etl = Custom_MySQL(using='etl_manage')
    mysql.begin()
    mysql_etl.begin()
    where = {'id': int(task_param['id'])}
    datas = {'load_status': 0}
    try:
        '''business logic starts here'''
        dir_param = {'game': task_param['game'],
                     'platform': task_param['platform'],
                     'log_date': task_param['log_date'],
                     'log_name': task_param['log_name']}
        filename_dict = {'log_name': task_param['log_name'],
                         'log_time': task_param['log_time'],
                         'source_ip': task_param['source_ip']}
        prefix_sql = task_param['prefix_sql']
        post_sql = task_param['post_sql']

        log_dir = "/%(game)s/%(platform)s/%(log_date)s/%(log_name)s/" % dir_param
        file_name = "%(log_name)s_%(source_ip)s_%(log_time)s.txt" % filename_dict

        # look up in redis which disk the data for this game is stored on,
        # caching the xml config on first use
        if redis.get("disk_xml") is None:
            with open('/data/etl_manage/conf/disk_game.xml', 'r') as disk_tmp:
                redis.set("disk_xml", str(disk_tmp.read()))
        disk_list = str(redis.get("disk_xml"))
        root = ET.fromstring(disk_list)
        disk = ""
        for gameinfo in root.findall('game'):
            if gameinfo.get('name') == task_param['game']:
                disk = gameinfo.get('disk')
                continue

        #local_log_dir = '/disk1/tmp_data'+log_dir
        local_log_dir = '/' + disk + '/data' + log_dir

        # only proceed if the file to be loaded actually exists
        if os.path.exists('%s%s' % (local_log_dir, file_name)):
            '''mark the task as started: 1'''
            datas['load_status'] = 1
            mysql_etl.update('file2mysql_log', ' id = %(id)d' % where, **datas)
            mysql_etl.commit()
            logger.info('start load data')

            # run the prefix sql
            if prefix_sql is not None and prefix_sql != '':
                mysql.query(prefix_sql)

            '''load the data file into mysql'''
            load_sql = task_param['load_sql']
            load_sql = load_sql.replace('{dir_path}', local_log_dir + file_name)
            load_sql = load_sql.replace('{table_name}', task_param['table_name'])
            load_sql = load_sql.replace('{db_name}', task_param['db_name'])
            result = mysql.load(load_sql)
            logger.info('load data to mysql: {0}'.format(result['output']))

            # check whether the load into mysql succeeded
            if result['status'] == 0:
                # run the post sql
                if post_sql is not None and post_sql != '':
                    post_sql = post_sql.replace('{table_name}', task_param['table_name'])
                    post_sql = post_sql.replace('{db_name}', task_param['db_name'])
                    mysql.query(post_sql)
                '''mark the task as loaded into mysql: 3'''
                datas['load_status'] = 3
            else:
                logger.error('Error Code %s : %s Cmd: %s' % (result['status'], result['output'], load_sql))
                '''load into mysql failed, mark the task as not executed: 0'''
                datas['load_status'] = 0

        '''mark the task as (simulated) removed from the task queue'''
        datas['in_queue'] = 0
        update_result = mysql_etl.update('file2mysql_log', ' id = %(id)d' % where, **datas)
        # If the update fails, call it once more; if it still fails, wait for the
        # automatic repair job. The chance of that should be very small.
        if update_result != 1:
            mysql_etl.update('file2mysql_log', ' id = %(id)d' % where, **datas)

        mysql_etl.commit()
        mysql.commit()
        mysql_etl.close()
        mysql.close()
        return True
    except Exception as exc:
        logger.error('file2mysql error: %s' % exc)
        mysql_etl.rollback()
        mysql.rollback()
        datas = {'in_queue': 0, 'load_status': 0}
        mysql_etl.update('file2mysql_log', ' id = %(id)d' % where, **datas)
        mysql_etl.commit()
        mysql_etl.close()
        mysql.close()
        raise self.retry(exc=exc, countdown=60)
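# ---------------------------------------------------------------------------
# Illustrative sketch (not part of the original task code): the ElementTree
# lookup in file2mysql (root.findall('game'), gameinfo.get('name') /
# gameinfo.get('disk')) implies that disk_game.xml keeps <game> elements with
# name/disk attributes directly under the root element. The root tag and the
# attribute values below are assumptions, shown only to document the layout.
# ---------------------------------------------------------------------------
EXAMPLE_DISK_GAME_XML = """
<disks>
    <game name="demo_game" disk="disk1"/>
    <game name="other_game" disk="disk2"/>
</disks>
"""


def lookup_disk(xml_text, game):
    # mirrors the parsing loop above: returns e.g. 'disk1' for 'demo_game',
    # which the task turns into a path like /disk1/data/<game>/<platform>/...
    root = ET.fromstring(xml_text)
    disk = ""
    for gameinfo in root.findall('game'):
        if gameinfo.get('name') == game:
            disk = gameinfo.get('disk')
    return disk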