Example 1
 def _merge_atask_staged_to_queued(self):
     staged_active_tasks = self._get_staged_active_tasks()
     self.logger.debug(
         "_merge_atask_staged_to_queued: Before, staged_active_tasks(%s)" %
         CommonUtil.objectList2Str(staged_active_tasks))
     queued_active_tasks = self._get_queued_active_task()
     self.logger.debug(
         "_merge_atask_staged_to_queued: Before, queued_active_tasks(%s)" %
         CommonUtil.objectList2Str(queued_active_tasks))
     atask_dict = {}  # task_id: active_task
     for atask in queued_active_tasks:
         atask_dict[atask.id] = atask
     for sa_task in staged_active_tasks:
         if sa_task.id not in atask_dict:
             atask_dict[sa_task.id] = sa_task
         else:
             atask_dict[sa_task.id].table_stage_list = \
                 self._merge_staged_tasks_queued(sa_task, atask_dict[sa_task.id])
     new_task_list = []
     for task_id in atask_dict:
         new_task_list.append(atask_dict[task_id])
     self.logger.debug(
         "_merge_atask_staged_to_queued: After, new_task_list(%s)" %
         CommonUtil.objectList2Str(new_task_list))
     return new_task_list
Example 2
 def table_stage_list(self):
     # unmarshal the table_stage_info column into Python objects
     # return a list of StageToProcess
     if self.table_stage_info:
         return CommonUtil.decodeTableStageInfo(self.table_stage_info)
     else:
         return []
Example 3
 def triggle_cond_list(self):
     # unmarshal the triggle_tables column into Python objects
     # return a list of TriggleCond
     if self.triggle_tables:
         return CommonUtil.decodeTriggleConds(self.triggle_tables)
     else:
         return []
Example 4
 def time_to_process(self):
     if self.type < 2:
         return True
     cur_datetime = CommonUtil.getCurrentDatetime()
     cur_year = cur_datetime.year
     cur_day_in_week = cur_datetime.weekday()
     cur_month = cur_datetime.month
     cur_day_in_month = cur_datetime.day
     cur_hour = cur_datetime.hour
     cur_minute = cur_datetime.minute
     # skip enqueueing if the task already fired during the current minute
     # (update_time is stamped below when the task is enqueued)
     if self.update_time and self.update_time.minute == cur_minute and self.update_time.hour == cur_hour and \
             self.update_time.day == cur_day_in_month and self.update_time.month == cur_month and \
             self.update_time.weekday() == cur_day_in_week:
         return False
     self.logger.debug("task schedule_cron is: %s, current time is:{minutes(%s), hour(%s),"
                     " day_in_month(%s), month(%s), day_in_week(%s)}" % \
                     (self.schedule_cron, cur_minute, cur_hour, cur_day_in_month, cur_month, cur_day_in_week))
     if cur_year in self.schedule_cron.years and \
         cur_day_in_week in self.schedule_cron.days_in_week and \
         cur_month in self.schedule_cron.months and \
         cur_day_in_month in self.schedule_cron.days_in_month and \
         cur_hour in self.schedule_cron.hours and \
         cur_minute in self.schedule_cron.minutes:
         self.update_time = cur_datetime
         self.logger.info("enqueue the time sheduled task(task_id=%s)" % self.id)
         return True
     return False
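
The membership checks in time_to_process (and in should_process, Example 12) assume that schedule_cron exposes one collection per cron field. A minimal sketch of such an object under that assumption; the SimpleCron name and the concrete field values are illustrative, not taken from the source:

# A hypothetical stand-in for the schedule_cron object used by
# time_to_process()/should_process(); each field only has to support "in".
from collections import namedtuple

SimpleCron = namedtuple(
    "SimpleCron",
    ["years", "months", "days_in_month", "days_in_week", "hours", "minutes"])

# e.g. "run every day at 02:30"
cron = SimpleCron(
    years=range(2016, 2031),
    months=range(1, 13),
    days_in_month=range(1, 32),
    days_in_week=range(0, 7),    # Monday=0, matching datetime.weekday()
    hours=[2],
    minutes=[30])

# time_to_process() returns True only when every current field is a member,
# e.g. cur_minute in cron.minutes and cur_hour in cron.hours and so on.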
Example 5
    def saveDataFrame(self, df, db_name, table_name, is_full):
        if len(sys.argv) < 2:
            raise Exception(
                "Export destination path should be given as an argument")
        export_dir_path = sys.argv[1]
        export_data_path, export_schema_path = CommonUtil.getExportPath(
            db_name, table_name, is_full, export_dir_path, "csv")
        try:
            df.write.csv(export_data_path)
        except Exception:
            self.logger.error("Write csv %s Error" % export_data_path)
            raise

        destSchema = {}
        destSchema["db_type"] = "derived"
        destSchema["db_version"] = None
        destSchema["db_name"] = db_name
        destSchema["table_name"] = table_name
        destSchema["schema"] = df.schema.jsonValue()["fields"]
        tmp_schema_path = "/tmp/" + db_name + "_" + table_name + ".json"
        # write the schema descriptor to a local temp file, then upload it to HDFS
        with open(tmp_schema_path, mode="w") as localTempFile:
            localTempFile.write(json.dumps(destSchema))
        hUtil = HDFSUtil()
        hUtil.upload2HDFS(tmp_schema_path, export_schema_path)
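
A minimal usage sketch for saveDataFrame, assuming it is a method of a wrapper class called SparkExporter here (the class name, Spark session setup and table name are illustrative); the export destination has to arrive as the first command-line argument, because the method reads it from sys.argv[1]:

# Hypothetical driver script, e.g. launched as:
#   spark-submit export_job.py /user/etl/export
from pyspark.sql import SparkSession

spark = SparkSession.builder.appName("export_example").getOrCreate()
df = spark.sql("SELECT * FROM sales.daily_orders")   # illustrative source table
exporter = SparkExporter()                           # hypothetical wrapper class
exporter.saveDataFrame(df, "sales", "daily_orders", is_full=True)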
Example 6
 def set_task_history_status(self, task_history, result_status, result):
     cur_time = CommonUtil.getCurrentDatetime()
     task_history.update_time = cur_time
     task_history.result_status = result_status
     task_history.result = result
     if result_status == 1:
         task_history.end_time = cur_time
     sess.commit()
     sess.flush()
Example 7
 def set_task_processed(self, queued_id):
     queued_task = sess.query(TaskQueue).get(queued_id)
     if queued_task:
         queued_task.has_processed = 1
         queued_task.end_time = CommonUtil.getCurrentDatetime().replace(tzinfo=None)
         sess.commit()
         sess.flush()
     else:
         raise ValueError("Can not find the queued_id(" + queued_id + ") from task queue")
Example 8
    def update_queue_tb(self):
        new_queued_tasks = self._merge_atask_staged_to_queued()
        # Get task from tb_task_queue
        for active_task in new_queued_tasks:
            self.logger.info("Begin to update task(id=%s) in tb_task_queue" %
                             active_task.id)
            # sq = sess.query(TaskQueue).filter(TaskQueue.mr_task_id == active_task.id, TaskQueue.has_processed == 0)
            # queued_task = sq.first()
            if active_task.table_stage_list:
                self.logger.debug("Insert ActiveTask(%s) to queue" %
                                  active_task)
                encoded_table_stage = CommonUtil.encodeTableStage(
                    active_task.table_stage_list)
                tq = TaskQueue(mr_task_id=active_task.id,
                               table_stage_info=encoded_table_stage)
                tq.create_time = CommonUtil.getCurrentDatetime()
                sess.add(tq)
                # queued_task.has_processed = 0
        sess.commit()
        sess.flush()
        self._update_stage_to_processed()
Example 9
 def createIndexFromDF(self, df):
     mp_dict = {}
     property_dict = {}
     mp_dict["mappings"] = {}
     mp_dict["mappings"][self.type] = {}
     mp_dict["mappings"][self.type]["properties"] = property_dict
     for col in df.schema:
         f_dict = {}
         property_dict[col.name] = f_dict
         f_dict["type"] = CommonUtil.getESType(col.dataType.typeName())
     self.logger.info("Put es mapping(%s)" % mp_dict)
     #self.es.indices.put_mapping(self.type, body=mp_dict, index=self.index)
     #self.es.create(self.index, self.type, self.id_field, mp_dict)
     self.es.indices.create(self.index, mp_dict)
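
For illustration, a DataFrame with columns (name: string, amount: double) would produce a mapping body shaped like the dict below; the concrete Elasticsearch field types are an assumption about what CommonUtil.getESType returns for those Spark type names:

# Shape of mp_dict as built by createIndexFromDF for a two-column DataFrame;
# "doc_type" stands in for self.type and the field types are assumed.
mp_dict = {
    "mappings": {
        "doc_type": {
            "properties": {
                "name": {"type": "keyword"},
                "amount": {"type": "double"},
            }
        }
    }
}
# self.es.indices.create(index, body) then creates the index with this mapping.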
Example 10
 def dequeue_task(self):
     sq = sess.query(TaskQueue).order_by(TaskQueue.id).filter(TaskQueue.has_processed == 0)
     task = sq.first()
     res = None
     if task:
         sq2 = sess.query(MRTask).filter(MRTask.id == task.mr_task_id)
         mr_task = sq2.first()
         atask = ActiveTask(mr_task)
         atask.table_stage_list = task.table_stage_list
         atask.begin_time = CommonUtil.getCurrentDatetime()
         res = (atask, task.id)
     else:
         res = (None, None)
     sess.commit()
     sess.flush()
     return res
Example 11
 def enqueue_time_task(self):
     self.logger.info("Begin to enque_time_task")
     sq_task = sess.query(MRTask).filter(MRTask.type > 1)
     for s in sq_task:
         #sq2_queue = sess.query(TaskQueue).filter(TaskQueue.mr_task_id == s.id)
         #qtask = sq2_queue.first()
         # the time task is not in the task queue yet, so we should enqueue it
         if s.time_to_process():
             tq = TaskQueue(mr_task_id=s.id)
             tq.create_time = CommonUtil.getCurrentDatetime().replace(
                 tzinfo=None)
             #s.update_time == tq.create_time
             tq.has_processed = 0
             sess.add(tq)
     sess.commit()
     sess.flush()
Example 12
 def should_process(self):
     if self.type < 2:
         return True
     cur_datetime = CommonUtil.getCurrentDatetime()
     cur_year = cur_datetime.year
     cur_day_in_week = cur_datetime.weekday()
     cur_month = cur_datetime.month
     cur_day_in_month = cur_datetime.day
     cur_hour = cur_datetime.hour
     cur_minute = cur_datetime.minute
     self.logger.debug("task schedule_cron is: %s, current time is:{minutes(%s), hour(%s),"
                     " day_in_month(%s), month(%s), day_in_week(%s)}" % \
                     (self.schedule_cron, cur_minute, cur_hour, cur_day_in_month, cur_month, cur_day_in_week))
     if cur_year in self.schedule_cron.years and \
         cur_day_in_week in self.schedule_cron.days_in_week and \
         cur_month in self.schedule_cron.months and \
         cur_day_in_month in self.schedule_cron.days_in_month and \
         cur_hour in self.schedule_cron.hours and \
         cur_minute in self.schedule_cron.minutes:
         return True
     return False
Example 13
# -*- coding: utf-8 -*-
import time
from common.util.util import CommonUtil
from taskqueuescan.task_queue_scan import TaskQueueScan
from common.config.config import task_queue_scan_interval

if __name__ == "__main__":
    while True:
        tqc = TaskQueueScan()
        atask, queued_id = tqc.dequeue_task()
        if atask:
            atask.begin_time = CommonUtil.getCurrentDatetime()
            task_history = tqc.move_task_to_history(atask)
            tqc.process_task(atask, task_history)
            atask.end_time = CommonUtil.getCurrentDatetime()
            #tqc.move_task_to_history(atask, r)
            #tqc.set_task_processed(queued_id)
            tqc.delete_queued_task(queued_id)

        time.sleep(task_queue_scan_interval)
Example 14
import os, re, shutil, json, time
import merge, common.config.config as config
import create_dir
import trigger_servers
from sqlalchemy import desc
import sys
import common.util.schema_paser as schema_paser
import traceback
from common.util.util import CommonUtil as util

reload(sys)
sys.setdefaultencoding("utf-8")

setting = None

env = util.getParam("env")

if env == "pro":
    setting = config.pro_path
else:
    setting = config.dev_path

# path prefix from the system configuration
prefix = setting.get("prefix")

# directory where the parquet files are stored
parquet_path = setting.get("parquet_path")

pattern = re.compile(r'^\d{8}_\d{2}_\d{2}_\d{2}$')

Example 15
import time
import common.config.config as common_config
import common.dao.table_schema as tb_table_schema
from pyspark.sql import SparkSession
from pyspark.sql.types import *
from hdfs import *
import common.dao.import_tables as import_tables
import common.db.db_config as db
from common.util.util import CommandExecutor
from common.util.util import CommonUtil as common_util
from sqlalchemy import desc
import json

setting = None

env = common_util.getParam("env")

if env == "pro":
    setting = common_config.pro_path
else:
    setting = common_config.dev_path

# address of the Spark master
spark_master_ip = setting.get("spark_master_ip")

# location of the Spark warehouse
spark_warehouse = setting.get("spark_warehouse")

# hidden (reserved) columns in the parquet files
hidden_colum = "resvd_stage_id,resvd_flag,resvd_create_time,resvd_latest_update_time"
Example 16
 def export_table_list(self):
     # split export_tables into a list of table names
     if self.export_tables:
         return CommonUtil.splitString(self.export_tables, ",")
     else:
         return []
Example 17
    def submit(self, active_task, task_history):
        if TASK_TYPE[active_task.type] == "HIVE" or TASK_TYPE[
                active_task.type] == "TIME_HIVE":
            hiveUtil = HiveUtil()
            #for tab in active_task.export_table_list:
            #hiveUtil.dropTable(tab)
            #print '--->not drop table<---'
            hiveconf_params = []
            if active_task.hive_params_list:
                for param in active_task.hive_params_list:
                    hiveconf_params.append("-hiveconf")
                    hiveconf_params.append(param)
            try:
                cmd_exec = CommandExecutor(self.hive_submit_bin, "-f",
                                           active_task.bin_file_uri,
                                           *hiveconf_params)
                cmd_exec.execute()
            except Exception as e:
                res = str(cmd_exec) + "Error: " + str(e)
                self.set_task_history_status(task_history, 2, res)
                raise e

            self.logger.debug("Export export_table_list: (%s)",
                              active_task.export_table_list)
            for tab in active_task.export_table_list:
                # export hive table (data & schema) to hdfs similar to spark task
                #export_sub_dir = active_task.export_dir_uri + "/" + active_task.db_name + "." + tab
                db_name, table_name = CommonUtil.splitDBAndTable(tab)
                try:
                    hiveUtil.exportTable(db_name, table_name, True,
                                         active_task.export_dir_uri)
                except Exception as e:
                    res = "exportTables Error: " + str(e)
                    self.set_task_history_status(task_history, 2, res)
                    raise e

        elif TASK_TYPE[active_task.type] == "SPARK" or TASK_TYPE[
                active_task.type] == "TIME_SPARK":
            hdfsUtil = HDFSUtil()
            local_bin_file = "/tmp/" + CommonUtil.getPathFlat(
                active_task.bin_file_uri)
            hdfsUtil.downloadFileFromHDFS(local_bin_file,
                                          active_task.bin_file_uri)
            try:
                cmd_exec = CommandExecutor(self.spark_submit_bin,
                                           local_bin_file,
                                           active_task.export_dir_uri)
                cmd_exec.execute()
            except Exception as e:
                res = str(cmd_exec) + " Error: " + str(e)
                self.set_task_history_status(task_history, 2, res)
                raise e
            if os.path.exists(local_bin_file):
                CommonUtil.removeLocalFile(local_bin_file)
        if active_task.has_derivative_table:
            hdfsUtil = HDFSUtil()
            tables_in_file = []
            for tab in active_task.export_table_list:
                db_tb = CommonUtil.splitDBAndTable(tab)
                d, t = db_tb[0], db_tb[1]
                tables_in_file.append(d + "--" + t)
            try:
                export_files = hdfsUtil.extractFilesFromDir(
                    active_task.export_dir_uri, *tables_in_file)
                self.logger.debug("Files generated in (%s) are (%s)" %
                                  (active_task.export_dir_uri, export_files))
                #data_files = hdfsUtil.getFilesBySuffix(export_files, ".parquet")
                data_files = hdfsUtil.getFilesBySuffix(export_files, ".csv")
                data_files.sort()
                self.logger.debug("Data files are (%s)" % data_files)
                schema_files = hdfsUtil.getFilesBySuffix(
                    export_files, "schema.sql")
                schema_files.sort()
                self.logger.debug("Schema files are (%s)" % schema_files)
            except Exception as e:
                res = "Extract exported files Error: " + str(e)
                self.set_task_history_status(task_history, 2, res)
                raise e
            for data_file in data_files:
                # data_file = data_files[i]
                try:
                    db_name, tb_name, timestamp, is_full = hdfsUtil.getExportProperties(
                        data_file)
                    schema_file = CommonUtil.getFileFromTimestamp(
                        schema_files, timestamp)
                    if is_full:
                        full_str = "full"
                    else:
                        full_str = "incremental"
                    cur_time = datetime.datetime.now()
                    date_str = CommonUtil.paddingTimeNum(cur_time.year) + CommonUtil.paddingTimeNum(cur_time.month) + CommonUtil.paddingTimeNum(cur_time.day) + "_" \
                               + CommonUtil.paddingTimeNum(cur_time.hour) + "_" + CommonUtil.paddingTimeNum(cur_time.minute) + "_" + CommonUtil.paddingTimeNum(cur_time.second)
                    dest_dir = config.pro_path[
                        "prefix"] + "/" + db_name + "/" + tb_name + "/" + full_str + "/" + date_str
                    if is_full:
                        data_file_dest = dest_dir + "/" + "data_full.csv"
                    else:
                        data_file_dest = dest_dir + "/" + "data_incremental.csv"
                    #data_file_dest = dest_dir  + "/" + "data_full.parquet"
                    schema_file_dest = dest_dir + "/" + "schema.sql"
                    hdfsUtil.copyMRResults2Local(data_file, data_file_dest)
                    hdfsUtil.copyMRResults2Local(schema_file, schema_file_dest)

                    #CommonUtil.convertCSV2PipeDelimited(data_file_dest)
                    done_indicator_file = dest_dir + "/" + "upload_completed"
                    CommonUtil.touch(done_indicator_file)

                    hdfsUtil.deleteFileFromHDFS(data_file)
                    hdfsUtil.deleteFileFromHDFS(schema_file)
                except Exception as e:
                    res = "Copying export files into input files Error: " + str(
                        e)
                    self.set_task_history_status(task_history, 2, res)
                    raise e
        self.set_task_history_status(task_history, 1, "Success")
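
As a side note, the date_str / dest_dir construction near the end of submit() can be sketched more compactly with strftime; this is only an equivalent sketch (not the project's code), and it yields the same "YYYYMMDD_HH_MM_SS" form that the pattern in Example 14 (r'^\d{8}_\d{2}_\d{2}_\d{2}$') expects:

# Equivalent sketch of the destination-path construction used in submit(),
# with illustrative values standing in for the variables from the loop above.
import datetime

db_name, tb_name, is_full = "sales", "orders", True    # illustrative values
prefix = "/data/input"                                  # stands in for config.pro_path["prefix"]
date_str = datetime.datetime.now().strftime("%Y%m%d_%H_%M_%S")
full_str = "full" if is_full else "incremental"
dest_dir = "/".join([prefix, db_name, tb_name, full_str, date_str])
data_file_dest = dest_dir + ("/data_full.csv" if is_full else "/data_incremental.csv")
schema_file_dest = dest_dir + "/schema.sql"
# e.g. /data/input/sales/orders/full/20240105_14_03_07/data_full.csv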
Example 18
 def get_stage_info(self):
     return CommonUtil.encodeTableStage(self.table_stage_list)
Example 19
 def hive_params_list(self):
     if self.hive_params:
         hive_params_list = CommonUtil.convertParam2List(self.hive_params)
         return hive_params_list
     else:
         return None