def load_file(date_arg):
    """Load the generated click-stream files into Hive for one day.

    The function checks the generated files with ``check_hive_file()``,
    drops the ``dt`` partition of the day being computed, and then loads
    every path in ``FILE_HIVE_LIST`` into ``HIVE_DB``.``HIVE_TABLE`` under
    that partition.

    :param date_arg: day to load, e.g. "2016-11-30"; dashes are stripped to
        build the partition value ("20161130").
    :return: ``False`` as soon as one file fails to load.  ``None`` in every
        other case, including when the file check fails (that path only
        logs an error).
    """
    global WORKERS, FILE_HIVE_LIST, HIVE_DB, HIVE_TABLE

    # Step 1: make sure the expected files were produced.
    if not check_hive_file():
        message = 'click stream file parse fail: file count not equal WORKERS'
        print('%s %s' % (MyTime.get_local_time(), '-------------- ' + message))
        log.error(message)
        # Alarm notification is currently disabled:
        # MyAlarm.send_mail_sms(message)
        return

    loader = MyHiveBin.HiveBin()
    partition_day = date_arg.replace('-', '')

    # Step 2: drop the partition of the computed day before reloading it, so
    # a rerun does not leave a previous load of the same day behind.
    loader.drop_partition(HIVE_DB, HIVE_TABLE, 'dt', partition_day)

    # Step 3: load each file; stop at the first failure.
    partition_spec = {'dt': partition_day}
    for hive_file in FILE_HIVE_LIST:
        message = 'load file ' + hive_file + ' into hive begin'
        print('%s %s' % (MyTime.get_local_time(), '-------------- ' + message))
        log.info(message)

        # Only an explicit False counts as failure (identity check kept).
        if loader.load_file(HIVE_DB, HIVE_TABLE, hive_file,
                            partition_spec) is False:
            message = ('Load file ' + hive_file +
                       ' into hive status:fail; Click stream parse exit')
            log.error(message)
            # Alarm notification is currently disabled:
            # MyAlarm.send_mail_sms(message)
            return False
def load_file(date, today):
    """Wait for the daily MySQL tar file, unpack it and load it into Hive.

    The 'xxxx-xx-xx' placeholder in the module-level ``tar_src`` and
    ``tar_des`` is replaced with ``date``; both globals are overwritten
    with the substituted paths.  The function then polls every
    ``time_rate`` seconds until the tar file exists, ``file_modify_stat``
    accepts it and ``MyTool.tar_file`` succeeds.  Every extracted file is
    run through dos2unix and loaded with ``load_file_single_overwrite``.

    :param date: date string substituted into the source/destination paths.
    :param today: date string substituted into ``latest_time`` to build the
        deadline timestamp.
    :return: ``True`` once the files were loaded; ``False`` when the
        deadline passes first (a critical log entry is written and an
        alarm is sent).
    """
    global tar_src
    global tar_des
    global latest_time

    tar_src = tar_src.replace('xxxx-xx-xx', date)
    tar_des = tar_des.replace('xxxx-xx-xx', date)
    print(tar_src)

    deadline = MyTime.datetime_timestamp(
        latest_time.replace('xxxx-xx-xx', today))

    if not os.path.exists(tar_des):
        os.system("mkdir -p " + tar_des)

    # NOTE(review): the original indentation was lost; the deadline check is
    # assumed to run on every iteration, not only when the tar file is
    # missing - confirm against the original file.
    while True:
        if not os.path.exists(tar_src):
            log.info('Mysql actual tar file not exists')
        elif file_modify_stat(tar_src) and MyTool.tar_file(tar_src, tar_des):
            for root, dirs, files in os.walk(tar_des):
                for dir_name in dirs:
                    print(os.path.join(root, dir_name))
                for file_name in files:
                    extracted = os.path.join(root, file_name)
                    # Convert line endings before handing the file to Hive.
                    os.popen('/usr/bin/dos2unix ' + extracted)
                    loader = MyHiveBin.HiveBin()
                    loader.load_file_single_overwrite('to8to_rawdata',
                                                      extracted)
            log.info('Mysql actual kefu yuyue to hive status:ok')
            return True

        # Not loaded yet: give up once the deadline has passed.
        if MyTime.datetime_timestamp(MyTime.get_local_time()) > deadline:
            log.critical('not find ' + tar_src)
            MyAlarm.send_mail_sms('Get Mysql actual tar file status:fail!')
            return False
        time.sleep(time_rate)
def __init__(self, date):
    """Derive the dates, paths and table settings used for one daily load.

    :param date: the day to process; ``MyTime.date_before(date)`` turns it
        into a day offset from today (per the original note, yesterday's
        date gives an offset of 1).
    """
    date_before = MyTime.date_before(date)
    self.dbName = 'to8to_rawdata'

    # Read "today" once so all derived dates share the same base day even
    # if the job runs across midnight.
    today = datetime.date.today()

    # Day being loaded; str(self.lastDate) looks like '2016-12-19'.
    self.lastDate = today - datetime.timedelta(days=date_before)
    print('%s %s' % ("load mysql all file begin--------------------",
                     self.lastDate))

    # The day before the one being loaded.
    self.last2Date = today - datetime.timedelta(days=date_before + 1)
    # NOTE(review): despite the "7" in the name, the offset is 10 days
    # before the loaded day - confirm which retention is intended.
    self.last7Date = today - datetime.timedelta(days=date_before + 10)

    # Same dates without dashes, e.g. '20161219'.
    self.last7DateFormat = str(self.last7Date).replace('-', '')
    self.lastDateFormat = str(self.lastDate).replace('-', '')

    # The original note here gives /data1/bi/platform/rawdata/1011/ as an
    # example location.
    self.data_src = src_path + "/tmp/bi/Mysql/"
    self.data_des = src_path + "/tar/"

    # <data_des>/<lastDate>/Mysql/ : txt files of the loaded day.
    self.txtPath = (self.data_des + str(self.lastDate) + os.sep + 'Mysql' +
                    os.sep)
    self.doPath = self.txtPath
    # <data_des>/<last2Date>/Mysql/ : directory of the previous day.
    self.delPath = (self.data_des + str(self.last2Date) + os.sep + 'Mysql' +
                    os.sep)

    # Columns of each table's txt file that are to be masked (replaced
    # with empty values).
    self.cutCol = {
        "to8to_fcom": [9, 10, 11, 12, 13],
        "to8to_jj_smt_zb": [6, 8, 26],
        "to8to_yuyue_apply": [18, 42],
        "to8to_fcom_info": [6, 23, 24],
        "to8to_to8toyw_contact": [7, 8],
        "to8to_to8toyw_back": [4, 5, 6],
        "to8to_mcom": [13, 14, 15, 27],
        "to8to_tuori": [15, 16, 20, 21],
        # Disabled in the original: "to8to_huodong_yanfang": [2, 3],
        "to8to_huodong_apply": [2],
        "to8to_gongdi": [13],
    }

    # Fields whose values are to be URL-decoded.
    self.unquote = {"to8to_project_src": [3]}

    # Tables whose partitions are kept.
    self.reserve = (
        'to8to_fcom',
        'to8to_yuyue_apply_fp',
        'to8to_zxb',
        'to8to_yuyue_apply',
        'to8to_dw_login_record',
        'to8to_yuyue_apply_record',
        'to8to_nps_record',
        'to8to_item_condition',
        'ts_record',
        'to8to_jj_jianli_clock',
        'to8to_yuyue_yyhh_shkf',
        'to8to_apply_config_node',
        'to8to_apply_config',
        'to8to_credit_log',
        'sem_360_diyu',
    )
(status, output) = commands.getstatusoutput(shell_command) if status == 0: print time.strftime("%H:%M:%S", time.localtime( )), "==========================================>", log_str, "ok" log.info(log_str + "ok") return True else: print time.strftime("%H:%M:%S", time.localtime( )), "==========================================>", log_str, "fail" log.info(log_str + "fail") return False if __name__ == '__main__': date_day = MyTime.get_date(1) if len(sys.argv) == 2: date_day = sys.argv[1] print time.strftime("%H:%M:%S", time.localtime( )), "==========================================>un tar begin" if un_tar(date_day): print time.strftime("%H:%M:%S", time.localtime( )), "==========================================>load to mongodb begin" if tar_to_mongodb(date_day): print time.strftime( "%H:%M:%S", time.localtime() ), "==========================================>load to mongodb end" if mongodb_to_mysql(date_day): time.sleep(10) print time.strftime(
log.debug(logString) #MyAlarm.send_mail_sms(logString) #删除分区 try: if table not in self.reserve: dropData = 'use to8to_rawdata;alter table ' + table + ' drop partition (dt=' + self.last7DateFormat + ')' # self.last7DateFormat = str(self.last7Date).replace('-', '') 删除7天前的分区 hive.execute(dropData) #删除掉当天txt文件 #os.remove(file) except Exception, ex: pass try: rmCmd = 'rm -rf ' + self.delPath # 删除当天的分区 self.doPath = '/data1/bi/platform/tar/2016-12-21/Mysql/' os.system(rmCmd) except Exception, ex: pass if __name__ == '__main__': date = MyTime.get_date(1) #自动取昨天日期 if len(sys.argv) == 2: date = sys.argv[1] hive = toHive(date) if hive.tarFile(): #解压tar.gz文件 hive.cutTxt() #替换隐私信息为null hive.txtToHive() #替换后的txt文件存入hive #os.system('python /data1/bi/platform/scripts/BI/BISub/bi_yewu_caiwu_argv.py ' + date) #os.system('python /data1/bi/platform/scripts/BI/BISub/bi_zxgs_yunyingjibie.py ' + date)
file_path = os.path.join(root, file) if 'sem_sm_keyword' in file: hive.load_file('to8to_rawdata', 'sem_sm_keyword', file_path, partition_dict) elif 'sem_sm_diyu' in file: hive.load_file('to8to_rawdata', 'sem_sm_diyu', file_path, partition_dict) else: pass try: shutil.rmtree(data_path) except Exception, ex: print ex return True else: pass leave_time += 300 if leave_time > wait_time: return False time.sleep(300) if __name__ == '__main__': date = MyTime.get_date(1) if len(sys.argv) == 2: date = sys.argv[1] if not load_all(date): MyAlarm.send_mail_sms('load sem shenma keyword to hive status:fail')
log.error(log_str) #MyAlarm.send_mail_sms(log_str) # 5.5 假如文件存在 但没导成功,将发送告警 return False def main(date_arg): global FILE_JSON_NAME, FILE_HIVE_PATH ## 3.1、首先设置全局变量,FILE_JSON_NAME = None FILE_HIVE_PATH = None set_file_path(date_arg) ## 3.2、设置文件路径 ,生成文件名列表 FILE_HIVE_LIST try: os.mkdir( FILE_HIVE_PATH ) ## 3.3 创建目录 FILE_HIVE_PATH :/data1/bi/platform/tar/2016-11-16/ClickStream/ except Exception, ex: print str(ex) pass print MyTime.get_local_time( ), '-------------- tar click stream file begin' # from cube import MyTime log.info( 'tar click stream file begin' ) # 写入日志 log = MyLog.MyLog(path='/data1/bi/platform/scripts/BI/ClickStream/log/', name='ClickStream', type='to8to', level='DEBUG') if check_file(): ## 3.4 检查json日志文件是否生成 print MyTime.get_local_time( ), '-------------- tar click stream file success, then process work' log.info('tar click stream file success') log.info('click stream to8to process work') get_file_size(FILE_JSON_NAME) ## 3.5 获得文件大小,这个有点多余 click_stream() ## 3.6 开始清洗 print MyTime.get_local_time(), '-------------- process success' log.info('click stream to8to process work success') if __name__ == '__main__':
for d in dirs: print os.path.join(root, d) for f in files: file = os.path.join(root, f) shell = '/usr/bin/dos2unix ' + file os.popen(shell) hive = MyHiveBin.HiveBin() hive.load_file_single_overwrite( 'to8to_rawdata', file) log.info('Mysql actual kefu yuyue to hive status:ok') break else: log.info('Mysql actual tar file not exists') now_time_stamp = MyTime.datetime_timestamp(MyTime.get_local_time()) if now_time_stamp > latest_time_stamp: log.critical('not find ' + tar_src) MyAlarm.send_mail_sms('Get Mysql actual tar file status:fail!') return False time.sleep(time_rate) return True if __name__ == '__main__': date = MyTime.get_date(1) if len(sys.argv) == 2: date = sys.argv[1] today = MyTime.get_date(0) if load_file(date, today): os.system(next_script)