def _merge_atask_staged_to_queued(self):
    staged_active_tasks = self._get_staged_active_tasks()
    self.logger.debug(
        "_merge_atask_staged_to_queued: Before, staged_active_tasks(%s)"
        % CommonUtil.objectList2Str(staged_active_tasks))
    queued_active_tasks = self._get_queued_active_task()
    self.logger.debug(
        "_merge_atask_staged_to_queued: Before, queued_active_tasks(%s)"
        % CommonUtil.objectList2Str(queued_active_tasks))
    atask_dict = {}  # task_id: active_task
    for atask in queued_active_tasks:
        atask_dict[atask.id] = atask
    for sa_task in staged_active_tasks:
        if sa_task.id not in atask_dict:
            atask_dict[sa_task.id] = sa_task
        else:
            atask_dict[sa_task.id].table_stage_list = \
                self._merge_staged_tasks_queued(sa_task, atask_dict[sa_task.id])
    new_task_list = list(atask_dict.values())
    self.logger.debug(
        "_merge_atask_staged_to_queued: After, new_task_list(%s)"
        % CommonUtil.objectList2Str(new_task_list))
    return new_task_list
@property
def table_stage_list(self):
    # Unmarshal table_stage_info into Python objects.
    # Returns a list of StageToProcess.
    if self.table_stage_info:
        return CommonUtil.decodeTableStageInfo(self.table_stage_info)
    return []
@property
def triggle_cond_list(self):
    # Unmarshal triggle_tables into Python objects.
    # Returns a list of TriggleCond.
    if self.triggle_tables:
        return CommonUtil.decodeTriggleConds(self.triggle_tables)
    return []
def time_to_process(self):
    # Non-scheduled task types are always ready to process.
    if self.type < 2:
        return True
    cur_datetime = CommonUtil.getCurrentDatetime()
    cur_year = cur_datetime.year
    cur_day_in_week = cur_datetime.weekday()
    cur_month = cur_datetime.month
    cur_day_in_month = cur_datetime.day
    cur_hour = cur_datetime.hour
    cur_minute = cur_datetime.minute
    # Skip if the task was already enqueued within this same scheduled minute.
    if self.update_time and self.update_time.minute == cur_minute and \
            self.update_time.hour == cur_hour and \
            self.update_time.day == cur_day_in_month and \
            self.update_time.month == cur_month and \
            self.update_time.weekday() == cur_day_in_week:
        return False
    self.logger.debug(
        "task schedule_cron is: %s, current time is: {minutes(%s), hour(%s),"
        " day_in_month(%s), month(%s), day_in_week(%s)}"
        % (self.schedule_cron, cur_minute, cur_hour, cur_day_in_month,
           cur_month, cur_day_in_week))
    if cur_year in self.schedule_cron.years and \
            cur_day_in_week in self.schedule_cron.days_in_week and \
            cur_month in self.schedule_cron.months and \
            cur_day_in_month in self.schedule_cron.days_in_month and \
            cur_hour in self.schedule_cron.hours and \
            cur_minute in self.schedule_cron.minutes:
        self.update_time = cur_datetime
        self.logger.info("enqueue the time scheduled task(task_id=%s)" % self.id)
        return True
    return False
def saveDataFrame(self, df, db_name, table_name, is_full):
    if len(sys.argv) < 2:
        raise Exception(
            "Export destination path should be given as an argument")
    export_dir_path = sys.argv[1]
    export_data_path, export_schema_path = CommonUtil.getExportPath(
        db_name, table_name, is_full, export_dir_path, "csv")
    try:
        df.write.csv(export_data_path)
    except Exception:
        self.logger.error("Write csv %s Error" % export_data_path)
        raise
    # Build the schema descriptor that accompanies the exported data.
    destSchema = {}
    destSchema["db_type"] = "derived"
    destSchema["db_version"] = None
    destSchema["db_name"] = db_name
    destSchema["table_name"] = table_name
    destSchema["schema"] = df.schema.jsonValue()["fields"]
    tmp_schema_path = "/tmp/" + db_name + "_" + table_name + ".json"
    with open(tmp_schema_path, mode="w") as localTempFile:
        localTempFile.write(json.dumps(destSchema))
    hUtil = HDFSUtil()
    hUtil.upload2HDFS(tmp_schema_path, export_schema_path)
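# Hedged sketch (illustrative, not part of the original code): the schema sidecar
# written by saveDataFrame() above is a plain JSON document. This shows the shape
# it takes for a hypothetical two-column DataFrame, without needing a Spark cluster;
# the database/table names and field list are assumptions for the example only.
import json

example_dest_schema = {
    "db_type": "derived",
    "db_version": None,
    "db_name": "demo_db",        # assumed example database name
    "table_name": "demo_table",  # assumed example table name
    # df.schema.jsonValue()["fields"] yields entries of this shape:
    "schema": [
        {"name": "id", "type": "long", "nullable": True, "metadata": {}},
        {"name": "name", "type": "string", "nullable": True, "metadata": {}},
    ],
}

print(json.dumps(example_dest_schema, indent=2))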
def set_task_history_status(self, task_history, result_status, result):
    cur_time = CommonUtil.getCurrentDatetime()
    task_history.update_time = cur_time
    task_history.result_status = result_status
    task_history.result = result
    if result_status == 1:
        task_history.end_time = cur_time
    sess.commit()
    sess.flush()
def set_task_processed(self, queued_id):
    queued_task = sess.query(TaskQueue).get(queued_id)
    if queued_task:
        queued_task.has_processed = 1
        queued_task.end_time = CommonUtil.getCurrentDatetime().replace(tzinfo=None)
        sess.commit()
        sess.flush()
    else:
        raise ValueError(
            "Can not find the queued_id(%s) from task queue" % queued_id)
def update_queue_tb(self):
    new_queued_tasks = self._merge_atask_staged_to_queued()
    # Get task from tb_task_queue
    for active_task in new_queued_tasks:
        self.logger.info("Begin to update task(id=%s) in tb_task_queue"
                         % active_task.id)
        # sq = sess.query(TaskQueue).filter(TaskQueue.mr_task_id == active_task.id,
        #                                   TaskQueue.has_processed == 0)
        # queued_task = sq.first()
        if active_task.table_stage_list:
            self.logger.debug("Insert ActiveTask(%s) to queue" % active_task)
            encoded_table_stage = CommonUtil.encodeTableStage(
                active_task.table_stage_list)
            tq = TaskQueue(mr_task_id=active_task.id,
                           table_stage_info=encoded_table_stage)
            tq.create_time = CommonUtil.getCurrentDatetime()
            sess.add(tq)
            # queued_task.has_processed = 0
    sess.commit()
    sess.flush()
    self._update_stage_to_processed()
def createIndexFromDF(self, df):
    # Build an Elasticsearch index mapping from the DataFrame schema.
    mp_dict = {}
    property_dict = {}
    mp_dict["mappings"] = {}
    mp_dict["mappings"][self.type] = {}
    mp_dict["mappings"][self.type]["properties"] = property_dict
    for col in df.schema:
        f_dict = {}
        property_dict[col.name] = f_dict
        f_dict["type"] = CommonUtil.getESType(col.dataType.typeName())
    self.logger.info("Put es mapping(%s)" % mp_dict)
    #self.es.indices.put_mapping(self.type, body=mp_dict, index=self.index)
    #self.es.create(self.index, self.type, self.id_field, mp_dict)
    self.es.indices.create(index=self.index, body=mp_dict)
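# Hedged sketch (assumption, not the actual CommonUtil implementation): one plausible
# Spark-to-Elasticsearch type translation that getESType() above might perform when
# createIndexFromDF() converts DataFrame column types into mapping field types.
_SPARK_TO_ES = {
    "string": "text",
    "integer": "integer",
    "long": "long",
    "double": "double",
    "float": "float",
    "boolean": "boolean",
    "timestamp": "date",
    "date": "date",
}

def spark_type_to_es(type_name):
    # Fall back to "text" for unmapped Spark types; purely illustrative.
    return _SPARK_TO_ES.get(type_name, "text")

# Example: spark_type_to_es("timestamp") -> "date"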
def dequeue_task(self):
    sq = sess.query(TaskQueue).order_by(TaskQueue.id).filter(TaskQueue.has_processed == 0)
    task = sq.first()
    res = None
    if task:
        sq2 = sess.query(MRTask).filter(MRTask.id == task.mr_task_id)
        mr_task = sq2.first()
        atask = ActiveTask(mr_task)
        atask.table_stage_list = task.table_stage_list
        atask.begin_time = CommonUtil.getCurrentDatetime()
        res = (atask, task.id)
    else:
        res = (None, None)
    sess.commit()
    sess.flush()
    return res
def enqueue_time_task(self):
    self.logger.info("Begin to enqueue_time_task")
    sq_task = sess.query(MRTask).filter(MRTask.type > 1)
    for s in sq_task:
        #sq2_queue = sess.query(TaskQueue).filter(TaskQueue.mr_task_id == s.id)
        #qtask = sq2_queue.first()
        # A time-scheduled task that is due should be enqueued.
        if s.time_to_process():
            tq = TaskQueue(mr_task_id=s.id)
            tq.create_time = CommonUtil.getCurrentDatetime().replace(tzinfo=None)
            #s.update_time == tq.create_time
            tq.has_processed = 0
            sess.add(tq)
    sess.commit()
    sess.flush()
def should_process(self):
    if self.type < 2:
        return True
    cur_datetime = CommonUtil.getCurrentDatetime()
    cur_year = cur_datetime.year
    cur_day_in_week = cur_datetime.weekday()
    cur_month = cur_datetime.month
    cur_day_in_month = cur_datetime.day
    cur_hour = cur_datetime.hour
    cur_minute = cur_datetime.minute
    self.logger.debug(
        "task schedule_cron is: %s, current time is: {minutes(%s), hour(%s),"
        " day_in_month(%s), month(%s), day_in_week(%s)}"
        % (self.schedule_cron, cur_minute, cur_hour, cur_day_in_month,
           cur_month, cur_day_in_week))
    if cur_year in self.schedule_cron.years and \
            cur_day_in_week in self.schedule_cron.days_in_week and \
            cur_month in self.schedule_cron.months and \
            cur_day_in_month in self.schedule_cron.days_in_month and \
            cur_hour in self.schedule_cron.hours and \
            cur_minute in self.schedule_cron.minutes:
        return True
    return False
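# Hedged sketch (not from the original codebase): illustrates the per-field cron
# membership check used by time_to_process() and should_process() above, assuming
# schedule_cron exposes pre-expanded sets of allowed values, as the attribute names
# suggest. The ExpandedCron class is hypothetical.
import datetime

class ExpandedCron(object):
    def __init__(self, minutes, hours, days_in_month, months, days_in_week, years):
        self.minutes = set(minutes)
        self.hours = set(hours)
        self.days_in_month = set(days_in_month)
        self.months = set(months)
        self.days_in_week = set(days_in_week)
        self.years = set(years)

    def matches(self, dt):
        # A datetime matches when every field falls inside its allowed set.
        return (dt.minute in self.minutes and
                dt.hour in self.hours and
                dt.day in self.days_in_month and
                dt.month in self.months and
                dt.weekday() in self.days_in_week and
                dt.year in self.years)

if __name__ == "__main__":
    # Every day at 02:30, any day-of-week, years 2000-2099.
    cron = ExpandedCron([30], [2], range(1, 32), range(1, 13), range(0, 7), range(2000, 2100))
    print(cron.matches(datetime.datetime(2024, 5, 1, 2, 30)))  # True
    print(cron.matches(datetime.datetime(2024, 5, 1, 2, 31)))  # False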
# -*- coding: utf-8 -*-
import time

from common.util.util import CommonUtil
from taskqueuescan.task_queue_scan import TaskQueueScan
from common.config.config import task_queue_scan_interval

if __name__ == "__main__":
    while True:
        tqc = TaskQueueScan()
        atask, queued_id = tqc.dequeue_task()
        if atask:
            atask.begin_time = CommonUtil.getCurrentDatetime()
            task_history = tqc.move_task_to_history(atask)
            tqc.process_task(atask, task_history)
            atask.end_time = CommonUtil.getCurrentDatetime()
            #tqc.move_task_to_history(atask, r)
            #tqc.set_task_processed(queued_id)
            tqc.delete_queued_task(queued_id)
        time.sleep(task_queue_scan_interval)
import os, re, shutil, json, time
import merge, common.config.config as config
import create_dir
import trigger_servers
from sqlalchemy import desc
import sys
import common.util.schema_paser as schema_paser
import traceback
from common.util.util import CommonUtil as util

reload(sys)
sys.setdefaultencoding("utf-8")

setting = None
env = util.getParam("env")
if env == "pro":
    setting = config.pro_path
else:
    setting = config.dev_path

# Prefix from the system configuration.
prefix = setting.get("prefix")
# Directory where the parquet files are stored.
parquet_path = setting.get("parquet_path")

# Matches timestamped export directories of the form YYYYMMDD_HH_MM_SS.
pattern = re.compile(r'^\d{8}_\d{2}_\d{2}_\d{2}$')
import time
import common.config.config as common_config
import common.dao.table_schema as tb_table_schema
from pyspark.sql import SparkSession
from pyspark.sql.types import *
from hdfs import *
import common.dao.import_tables as import_tables
import common.db.db_config as db
from common.util.util import CommandExecutor
from common.util.util import CommonUtil as common_util
from sqlalchemy import desc
import json

setting = None
env = common_util.getParam("env")
if env == "pro":
    setting = common_config.pro_path
else:
    setting = common_config.dev_path

# Address of the Spark master.
spark_master_ip = setting.get("spark_master_ip")
# Location of the Spark warehouse.
spark_warehouse = setting.get("spark_warehouse")
# Hidden (reserved) columns of the parquet files.
hidden_colum = "resvd_stage_id,resvd_flag,resvd_create_time,resvd_latest_update_time"
@property
def export_table_list(self):
    # Split the configured export_tables string into a list of table names.
    if self.export_tables:
        return CommonUtil.splitString(self.export_tables, ",")
    return []
def submit(self, active_task, task_history):
    if TASK_TYPE[active_task.type] == "HIVE" or TASK_TYPE[active_task.type] == "TIME_HIVE":
        hiveUtil = HiveUtil()
        #for tab in active_task.export_table_list:
        #    hiveUtil.dropTable(tab)
        #print '--->not drop table<---'
        hiveconf_params = []
        if active_task.hive_params_list:
            for param in active_task.hive_params_list:
                hiveconf_params.append("-hiveconf")
                hiveconf_params.append(param)
        try:
            cmd_exec = CommandExecutor(self.hive_submit_bin, "-f",
                                       active_task.bin_file_uri, *hiveconf_params)
            cmd_exec.execute()
        except Exception as e:
            res = str(cmd_exec) + " Error: " + str(e)
            self.set_task_history_status(task_history, 2, res)
            raise e
        self.logger.debug("Export export_table_list: (%s)",
                          active_task.export_table_list)
        for tab in active_task.export_table_list:
            # Export the hive table (data & schema) to HDFS, similar to a spark task.
            #export_sub_dir = active_task.export_dir_uri + "/" + active_task.db_name + "." + tab
            db_name, table_name = CommonUtil.splitDBAndTable(tab)
            try:
                hiveUtil.exportTable(db_name, table_name, True,
                                     active_task.export_dir_uri)
            except Exception as e:
                res = "exportTables Error: " + str(e)
                self.set_task_history_status(task_history, 2, res)
                raise e
    elif TASK_TYPE[active_task.type] == "SPARK" or TASK_TYPE[active_task.type] == "TIME_SPARK":
        hdfsUtil = HDFSUtil()
        local_bin_file = "/tmp/" + CommonUtil.getPathFlat(active_task.bin_file_uri)
        hdfsUtil.downloadFileFromHDFS(local_bin_file, active_task.bin_file_uri)
        try:
            cmd_exec = CommandExecutor(self.spark_submit_bin, local_bin_file,
                                       active_task.export_dir_uri)
            cmd_exec.execute()
        except Exception as e:
            res = str(cmd_exec) + " Error: " + str(e)
            self.set_task_history_status(task_history, 2, res)
            raise e
        if os.path.exists(local_bin_file):
            CommonUtil.removeLocalFile(local_bin_file)
        if active_task.has_derivative_table:
            hdfsUtil = HDFSUtil()
            tables_in_file = []
            for tab in active_task.export_table_list:
                db_tb = CommonUtil.splitDBAndTable(tab)
                d, t = db_tb[0], db_tb[1]
                tables_in_file.append(d + "--" + t)
            try:
                export_files = hdfsUtil.extractFilesFromDir(
                    active_task.export_dir_uri, *tables_in_file)
                self.logger.debug("Files generated in (%s) are (%s)"
                                  % (active_task.export_dir_uri, export_files))
                #data_files = hdfsUtil.getFilesBySuffix(export_files, ".parquet")
                data_files = hdfsUtil.getFilesBySuffix(export_files, ".csv")
                data_files.sort()
                self.logger.debug("Data files are (%s)" % data_files)
                schema_files = hdfsUtil.getFilesBySuffix(export_files, "schema.sql")
                schema_files.sort()
                self.logger.debug("Schema files are (%s)" % schema_files)
            except Exception as e:
                res = "Extract exported files Error: " + str(e)
                self.set_task_history_status(task_history, 2, res)
                raise e
            for data_file in data_files:
                try:
                    db_name, tb_name, timestamp, is_full = hdfsUtil.getExportProperties(data_file)
                    schema_file = CommonUtil.getFileFromTimestamp(schema_files, timestamp)
                    if is_full:
                        full_str = "full"
                    else:
                        full_str = "incremental"
                    cur_time = datetime.datetime.now()
                    date_str = (CommonUtil.paddingTimeNum(cur_time.year)
                                + CommonUtil.paddingTimeNum(cur_time.month)
                                + CommonUtil.paddingTimeNum(cur_time.day) + "_"
                                + CommonUtil.paddingTimeNum(cur_time.hour) + "_"
                                + CommonUtil.paddingTimeNum(cur_time.minute) + "_"
                                + CommonUtil.paddingTimeNum(cur_time.second))
                    dest_dir = (config.pro_path["prefix"] + "/" + db_name + "/"
                                + tb_name + "/" + full_str + "/" + date_str)
                    if is_full:
                        data_file_dest = dest_dir + "/" + "data_full.csv"
                    else:
                        data_file_dest = dest_dir + "/" + "data_incremental.csv"
                    #data_file_dest = dest_dir + "/" + "data_full.parquet"
                    schema_file_dest = dest_dir + "/" + "schema.sql"
                    hdfsUtil.copyMRResults2Local(data_file, data_file_dest)
                    hdfsUtil.copyMRResults2Local(schema_file, schema_file_dest)
                    #CommonUtil.convertCSV2PipeDelimited(data_file_dest)
                    done_indicator_file = dest_dir + "/" + "upload_completed"
                    CommonUtil.touch(done_indicator_file)
                    hdfsUtil.deleteFileFromHDFS(data_file)
                    hdfsUtil.deleteFileFromHDFS(schema_file)
                except Exception as e:
                    res = "Copying export files into input files Error: " + str(e)
                    self.set_task_history_status(task_history, 2, res)
                    raise e
    self.set_task_history_status(task_history, 1, "Success")
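# Hedged sketch (illustrative only): how the export destination path assembled in
# submit() above is laid out: <prefix>/<db>/<table>/<full|incremental>/<YYYYMMDD_HH_MM_SS>.
# The build_dest_dir helper and the example prefix are hypothetical; the layout and
# timestamp format mirror the concatenation in submit() and the regex in the import
# module above.
import datetime

def build_dest_dir(prefix, db_name, tb_name, is_full, now=None):
    now = now or datetime.datetime.now()
    full_str = "full" if is_full else "incremental"
    date_str = now.strftime("%Y%m%d_%H_%M_%S")
    return "%s/%s/%s/%s/%s" % (prefix, db_name, tb_name, full_str, date_str)

# Example: build_dest_dir("/data/import", "sales", "orders", True)
# -> "/data/import/sales/orders/full/20240501_02_30_00" (for that timestamp)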
def get_stage_info(self):
    return CommonUtil.encodeTableStage(self.table_stage_list)
@property
def hive_params_list(self):
    # Convert the configured hive_params string into a list of hiveconf entries.
    if self.hive_params:
        return CommonUtil.convertParam2List(self.hive_params)
    return None
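# Hedged sketch (assumption about the parameter format, not the CommonUtil code):
# how comma-separated hive_params such as "k1=v1,k2=v2" could expand into the
# "-hiveconf k=v" argument pairs that submit() appends to the hive command line.
def hive_params_to_args(hive_params):
    args = []
    for param in hive_params.split(","):
        param = param.strip()
        if param:
            args.extend(["-hiveconf", param])
    return args

# Example: hive_params_to_args("mapred.job.queue.name=etl,hive.exec.parallel=true")
# -> ["-hiveconf", "mapred.job.queue.name=etl", "-hiveconf", "hive.exec.parallel=true"]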