def task_finish(self, poll_interval=30):
    """Block until no job is waiting or running, then return.

    Polls ``t_job`` for rows whose status is 0 or 1 and, after every poll
    that still finds such rows, logs a heartbeat with the number of rows in
    each status before sleeping.

    :param poll_interval: seconds to sleep between two polls.
    :return: True once the query returns no rows.
    """
    sql = 'select status, id, topic, job_params from t_job where status in (0,1);'
    while True:
        jobs = self.db.query(sql)
        # TODO: add a timeout / forced-termination mechanism for stuck jobs.
        if not jobs:
            Logging.info('所有任务执行完成!')
            # TODO: unified scheduling of the load-into-database step.
            # TODO: monitoring / alerting.
            # The original fell through with `pass` and polled forever.
            return True
        task_waiting = sum(1 for job in jobs if job[0] == 0)
        task_running = sum(1 for job in jobs if job[0] == 1)
        Logging.info('heartbeat 待执行任务数:', task_waiting, '执行中任务数:', task_running)
        sleep(poll_interval)
def add_store(self, name, plt_name, plt_store_id, login_username=None, url=None, status=1, properties=None):
    """Create a store and, optionally, its properties.

    :param name: store name; creation is refused if it already exists
    :param plt_name: platform name
    :param plt_store_id: id of the store on the platform
    :param login_username: login user name
    :param url: login url
    :param status: store status, 1 for valid and 0 for invalid
    :param properties: list of property rows; the first four items of each
        row are passed to ``StorePropertyDao().insert`` after the store key
    :return: the result of ``get_store`` for the new key, or None when the
        name already exists
    """
    if self.check_store_name_exists(name):
        Logging.error('add_store:', name, '店铺名已存在!')
        return None

    key = StoreDao().insert(name, plt_name, plt_store_id, login_username, url, status)
    # A falsy `properties` (None, empty list) means there is nothing to insert.
    for prop in properties or ():
        StorePropertyDao().insert(key, prop[0], prop[1], prop[2], prop[3])
    return self.get_store(key)
def _get_data_tabs(self, page_data_id):
    """Load every data tab that belongs to a page-data block.

    :param page_data_id: id of the page-data block
    :return: list with one ``_get_data_tab`` result per row found, or None
        (after logging an error) when the query returns nothing
    """
    rows = DataTabDao().query_by_page_data_id(page_data_id)
    if not rows:
        Logging.error('不存在该page_data_id:', page_data_id)
        return None
    # The first column of each row is used as the tab id.
    return [self._get_data_tab(row[0]) for row in rows]
def check_store_login(self):
    """Check that the browser session is logged in to the store.

    Opens ``https://sycm.taobao.com`` and inspects the URL the browser ends
    up on. If it contains ``login.htm`` the store is treated as not logged
    in: ``self.error`` is set to ``ErrorEnum.ERROR_1004`` and an exception
    is raised. Otherwise ``self.login_flag`` is set to True.

    :raises Exception: when the browser lands on a login page
    """
    self.driver.get('https://sycm.taobao.com')
    Time.sleep(1)  # give the page a moment to redirect
    current_url = self.driver.current_url
    if 'login.htm' in current_url:
        Logging.error('store:', self.store.name, 'current_url:', current_url, '店铺未登录,无法继续取数!')
        self.error = ErrorEnum.ERROR_1004
        raise Exception('store:', self.store.name, '店铺未登录,无法继续取数!')
    # Was `self.login_flag = Time` (the module object), truthy only by accident.
    self.login_flag = True
def _get_data_tab(self, tab_id):
    """Build the entity for one data tab, including its columns.

    :param tab_id: id of the data tab
    :return: a ``DataTabEntity`` built from the first eight fields of the
        first row found plus the tab's columns, or None (after logging an
        error) when the query returns nothing
    """
    rows = DataTabDao().query(tab_id)
    if not rows:
        Logging.error('不存在该tab_id:', tab_id)
        return None

    record = rows[0]
    columns = self._get_data_tab_columns(tab_id)
    fields = [record[i] for i in range(8)]
    return DataTabEntity(*fields, columns)
def operation_page(self):
    """Open the branding report page, click one element and wait for a file.

    The report url (including its fixed date range) and the xpath of the
    clicked element are hard-coded. After the click the method calls
    ``wait_download_finish`` and logs ``self.source_data_list``.
    """
    report_url = 'https://branding.taobao.com/#!/report/index?productid=101005202&effect=15&startdate=2019-06-05&enddate=2019-06-19'
    # Presumably the download link of the report; the click is followed by
    # the wait for a downloaded file.
    target_xpath = '//*[@id="brix_12290"]/div[4]/a'

    self.driver.get(report_url)
    Time.sleep(3)
    element = self.driver.find_element_by_xpath(target_xpath)
    element.click()
    Time.sleep(3)
    self.wait_download_finish()
    Logging.info(self.source_data_list)
    Logging.info('end')
def _get_page_data_confs(self, page_data_id):
    """Load the configuration entries of a page-data block.

    :param page_data_id: id of the page-data block
    :return: list of ``PageDataConfEntity`` (one per row, built from the
        row's first eight fields), or None (after logging a warning) when
        the query returns nothing
    """
    rows = PageDataConfDao().query_by_page_data_id(page_data_id)
    if not rows:
        Logging.warning('不存在该page_data_id:', page_data_id)
        return None
    return [PageDataConfEntity(*[row[i] for i in range(8)]) for row in rows]
def _get_data_tab_columns(self, tab_id):
    """Load the column definitions of a data tab.

    :param tab_id: id of the data tab
    :return: list of ``DataTabColumnEntity`` (one per row, built from the
        row's first twelve fields), or None (after logging an error) when
        the query returns nothing
    """
    rows = DataTabColumnDao().query_by_tab_id(tab_id)
    if not rows:
        Logging.error('不存在该data_tab_id:', tab_id)
        return None
    return [DataTabColumnEntity(*[row[i] for i in range(12)]) for row in rows]
def get_page(self, page_id):
    """Fetch the page that has to be crawled.

    :param page_id: page id
    :return: a ``PageEntity`` built from the first nine fields of the first
        row found, or None (after logging an error) when the query returns
        nothing
    """
    rows = PageDao().query(page_id)
    if not rows:
        Logging.error('page_id:', page_id, ' 不存在!')
        return None

    record = rows[0]
    return PageEntity(*[record[i] for i in range(9)])
def _locate_page(self):
    """Navigate the browser to the page configured in ``self.page.url``.

    The url is requested twice: the first request lands on the platform's
    default page, the second one reaches the page to crawl.

    :return: True on success; False if any request raises, in which case
        the exception is logged and ``self.error`` is set to
        ``ErrorEnum.ERROR_3001``
    """
    try:
        self.driver.get(self.page.url)  # first request: platform default page
        # The original called `self.driver.close(self.page.url)` between the
        # two requests. Selenium's WebDriver.close() accepts no argument, so
        # that call always raised TypeError and this method always failed.
        self.driver.get(self.page.url)  # second request: the page to crawl
    except Exception as e:
        Logging.error(e)
        self.error = ErrorEnum.ERROR_3001
        return False
    return True
def __init__(self, store_id, page_data_id, port):
    """Prepare everything a crawl task needs.

    1. Load the store and page-data objects.
    2. Build and create the download / process / backup directories and
       clear the download directory.
    3. Attach to the browser via ``init_web_driver``.
    4. Verify the store is logged in via ``check_store_login`` (which sets
       ``login_flag`` to True on success).

    Any exception is logged and swallowed; ``self.error`` is then set, so
    callers must inspect ``error`` / ``login_flag`` after construction.

    :param store_id: store id, used to load the store object
    :param page_data_id: id of the page-data block to crawl
    :param port: port of the already running browser service
    """
    self.error = None
    self.login_flag = False

    # Define every attribute up front: initialisation failures are swallowed
    # below, and later steps must not hit AttributeError on a half-built
    # instance.
    self.store = None
    self.page_data = None
    self.page = None
    self.db = None
    self.port = port
    self.driver = None
    # Data-dimension dictionary.
    self.data_dimension_dict = {}
    # Names of downloaded files (needed when data comes from a download).
    self.file_names = []
    # Single file, single table:             [DataFrame]
    # Several files/sheets, single table:    [DataFrame, DataFrame, DataFrame]
    # TODO: several files/sheets with several tables is not handled yet;
    #       condition would be page_data.is_multiple_tab(), e.g.
    #       [{'tab.name', [DataFrame]}, {'tab.name', [DataFrame, DataFrame]}]
    self.source_data_list = []
    self.data_list = []

    try:
        self.store = StoreService().get_store(store_id)
        self.page_data = PageDataService().get_page_data(page_data_id)
        self.page = self.page_data.page
        self.db = DataBase()

        self.FILE_PART_PATH = self.store.name + '/' + self.page_data.name + '/' + self.page_data.data_update_freq
        self.FILE_DOWNLOAD_PATH = setting.FILE_DOWNLOAD_PATH_PREFIX + '/' + self.store.name
        self.FILE_PROCESS_PATH = setting.FILE_PROCESS_PATH_PREFIX + '/' + self.FILE_PART_PATH
        self.FILE_BACKUP_PATH = setting.FILE_BACKUP_PATH_PREFIX + '/' + self.FILE_PART_PATH
        for path in (self.FILE_DOWNLOAD_PATH, self.FILE_PROCESS_PATH, self.FILE_BACKUP_PATH):
            # exist_ok avoids the check-then-create race between workers.
            os.makedirs(path, exist_ok=True)

        # Clear the download directory.
        self.clear_download_path()

        # Attach to the browser and check whether the store is logged in.
        self.init_web_driver()
        self.check_store_login()
    except Exception as e:
        Logging.error(e)
        # Keep a more specific code already set by a helper before it raised
        # (init_web_driver / check_store_login set their own codes).
        if self.error is None:
            self.error = ErrorEnum.ERROR_1000
def get_store(self, store_id):
    """Fetch a single store.

    :param store_id: store id
    :return: a ``StoreEntity`` built from the first nine fields of the first
        row found plus the store's properties, or None (after logging an
        error) when the query returns nothing
    """
    rows = StoreDao().query(store_id)
    if not rows:
        Logging.error('店铺id不存在:', store_id)
        return None

    record = rows[0]
    properties = self._get_store_properties(store_id)
    fields = [record[i] for i in range(9)]
    return StoreEntity(*fields, properties)
def _get_store_properties(self, store_id):
    """Fetch the properties of a store.

    :param store_id: store id
    :return: list of ``StorePropertyEntity`` (one per row, built from the
        row's first eight fields), or None (after logging an error) when the
        query returns nothing
    """
    rows = StorePropertyDao().query_by_store_id(store_id)
    if not rows:
        Logging.error('该店铺id不存在:', store_id)
        return None
    return [StorePropertyEntity(*[row[i] for i in range(8)]) for row in rows]
def operation_data_input(self):
    """Insert the first processed DataFrame into the first target table.

    The DataFrame's column names are used as the table's column names and
    every row becomes one parameter tuple for ``self.db.insert_many``; the
    transaction is committed afterwards. Any exception is logged and
    ``self.error`` is set to ``ErrorEnum.ERROR_5002``.
    """
    try:
        df = self.data_list[0]
        column_names = df.columns.tolist()
        # One plain tuple per DataFrame row.
        rows = list(df.itertuples(index=False, name=None))

        # Formatting the tuple of names directly produced "('a', 'b')" (or
        # "('a',)" for one column), i.e. string literals instead of
        # identifiers. Quote each name with backticks instead (the file's
        # other SQL uses MySQL functions such as last_insert_id()).
        columns_sql = ', '.join(
            '`{}`'.format(str(col).replace('`', '``')) for col in column_names)
        placeholders = ','.join(['%s'] * len(column_names))
        insert_sql = 'insert into {} ({}) values ({})'.format(
            self.page_data.data_tabs[0].name, columns_sql, placeholders)

        self.db.insert_many(insert_sql, rows)
        self.db.commit()
    except Exception as e:
        Logging.error(e)
        self.error = ErrorEnum.ERROR_5002
def __init__(self, name, param=None):
    """Instantiate the object / task identified by ``name``.

    :param name: object identifier, written as the path from the package
        down to the class, e.g. ``handle.xxx.Obj``
    :param param: dict of construction arguments; the website tasks read
        ``store_id``, ``page_data_id`` and ``port`` from it. Defaults to a
        fresh empty dict.
    """
    self.error = None
    self.obj = None
    self.obj_name = name
    # A `{}` default would be one dict shared by every instance.
    self.obj_param = {} if param is None else param
    try:
        Logging.info(self.obj_name, self.obj_param, ' 实例化 start!')
        if self.obj_name == 'handle.task_creator.TaskCreator':
            self.obj = TaskCreator()
        elif self.obj_name == 'handle.login.tb_login.TaoLogin':
            try:
                self.obj = tb_login()
            except Exception as e:
                Logging.error(e)
                self.error = ErrorEnum.ERROR_2000
        # ================= crawl-page task configuration: START =================
        elif self.obj_name == 'handle.website.subway.report.SubReportDay':
            self.obj = SpreadReportDay(self.obj_param['store_id'],
                                       self.obj_param['page_data_id'],
                                       self.obj_param['port'])
        elif self.obj_name == 'handle.website.subway.direct_report.SpreadReportDay':
            self.obj = SpreadReportDay1(self.obj_param['store_id'],
                                        self.obj_param['page_data_id'],
                                        self.obj_param['port'])
        # ================= crawl-page task configuration: END ===================
        else:
            self.error = ErrorEnum.ERROR_9001
            # str() is required: concatenating the dict directly raised
            # TypeError and replaced this message with the generic handler.
            self.error.value.set_msg(
                '未匹配到任务实例 name:' + str(self.obj_name) + ',param:' + str(self.obj_param))

        if self.is_success():
            Logging.info(self.obj_name, self.obj_param, ' 实例化成功 end!')
        else:
            Logging.info(self.obj_name, self.obj_param, ' 实例化失败 error:', self.error, ' end!')
    except Exception as e:
        Logging.error(e)
        # Prefer the error recorded by the wrapped object; fall back to the
        # generic code only when nothing more specific is known.
        if self.is_success() and self.obj and self.obj.error:
            self.error = self.obj.error
        elif self.is_success():
            self.error = ErrorEnum.ERROR_9999
def get_task(self):
    """Pick one pending job (status 0) from ``t_job``.

    ``job_params`` is parsed as ``<store_id>|<page_data_id>,<page_data_id>,...``;
    empty segments in the id list (for example a trailing comma) are
    ignored. The page-data ids are returned in random order.

    :return: ``(job_id, store_id, page_data_ids)``, or
        ``(None, None, None)`` when no pending job exists
    """
    # TODO: wrap the pick-up in a database transaction.
    sql = 'select id, job_params from t_job where status = 0 order by job_sort,RAND();'
    jobs = self.db.query(sql)
    if not jobs:
        return None, None, None

    job = jobs[0]
    Logging.info('总任务数:', len(jobs), ' 获取任务:', job)
    job_id = int(job[0])
    parts = job[1].split('|')
    store_id = int(parts[0])
    # `list.remove('')` raised ValueError when there was no empty segment and
    # dropped only the first one when there were several; filter instead.
    page_data_ids = [int(s) for s in parts[1].split(',') if s.strip()]
    shuffle(page_data_ids)
    return job_id, store_id, page_data_ids
def operation_data_process(self):
    """Prepare the first downloaded DataFrame for loading.

    Adds the target table's columns that have no ``check_col_name``, fills
    the fixed bookkeeping columns (store, date, file path, timestamps, ...),
    logs any difference between the file's columns and the expected source
    columns, and appends the result to ``self.data_list``.

    :return: True on success; False if any step raises, in which case the
        exception is logged and ``self.error`` is set to
        ``ErrorEnum.ERROR_5001``
    """
    try:
        check_field_names = []  # source column names to compare with the file
        default_add_field = []  # table columns with no source column to check
        for data_tab_column in self.page_data.data_tabs[0].data_tab_columns:
            if data_tab_column.check_col_name is not None:
                check_field_names.append(data_tab_column.check_col_name)
            else:
                default_add_field.append(data_tab_column.col_name)

        # The DataFrame read from the downloaded file. The original never
        # bound `df` and referenced `self.default_add_field` (never defined),
        # so this step always ended in ERROR_5001.
        df = self.source_data_list[0]

        # Add the default columns, then fill them.
        df = pd.concat([df, pd.DataFrame(columns=default_add_field)], sort=False)
        df['店铺id'] = self.store.id
        df['店铺名'] = self.store.name
        df['日期'] = df['_日期']
        df['文件路径'] = self.FILE_BACKUP_PATH
        df['文件sheet'] = 'sheet'
        df['转化周期'] = '15天累计数据'
        df['报表类型'] = '宝贝'
        df['入库时间'] = get_current_timestamp()
        df['取数时间'] = get_current_timestamp()

        # Compare the file's columns with the expected source columns.
        # TODO: extra / missing columns should also go into the alert message.
        file_col_names = df.columns.tolist()
        increase_field = list(set(file_col_names) - set(check_field_names))
        reduce_field = list(set(check_field_names) - set(file_col_names))
        if increase_field:
            Logging.warning('文件多出字段:', increase_field)
        if reduce_field:
            Logging.warning('文件缺少字段:', reduce_field)

        self.data_list.append(df)
    except Exception as e:
        Logging.error(e)
        self.error = ErrorEnum.ERROR_5001
        return False
    return True
def init_web_driver(self):
    """Attach to the Chrome instance listening on ``self.port``.

    Connects through Chrome's ``debuggerAddress`` option at
    ``127.0.0.1:<port>`` and stores the driver in ``self.driver``.

    :return: True when the driver was created
    :raises Exception: when the browser cannot be attached; ``self.error``
        is set to ``ErrorEnum.ERROR_1003`` first
    """
    try:
        chrome_options = Options()
        chrome_options.add_experimental_option(
            "debuggerAddress", "127.0.0.1:{}".format(self.port))
        self.driver = webdriver.Chrome(chrome_options=chrome_options)
        Logging.info('{} - Chrome[{}]连接成功。'.format(self.store.name, self.port))
    except Exception as e:
        # Send the cause to the log (it used to go to stdout via print) and
        # keep it attached to the exception that is re-raised.
        Logging.error(e)
        Logging.error('port:{} 无法接管浏览器'.format(self.port))
        self.error = ErrorEnum.ERROR_1003
        raise Exception('port:{} 无法接管浏览器'.format(self.port)) from e
    return True
def get_page_data(self, page_data_id):
    """Fetch a page-data block together with its related objects.

    :param page_data_id: id of the page-data block
    :return: a ``PageDataEntity`` built from the first eleven fields of the
        first row found plus its configuration entries, its page (looked up
        with the row's second field) and its data tabs; None (after logging
        an error) when the query returns nothing
    """
    rows = PageDataDao().query(page_data_id)
    if not rows:
        Logging.error('不存在该page_data_id:', page_data_id)
        return None

    record = rows[0]
    page_data_confs = self._get_page_data_confs(page_data_id)
    page = PageService().get_page(record[1])
    data_tabs = self._get_data_tabs(page_data_id)
    fields = [record[i] for i in range(11)]
    return PageDataEntity(*fields, page_data_confs, page, data_tabs)
def df_effective_by_starting_position(self, starting_position, source_df: pd.DataFrame):
    """Cut the real table out of a raw sheet.

    Rows are scanned top to bottom. The row whose first cell equals
    ``starting_position`` supplies the column titles, and every row after
    it becomes a data row. If a later row also starts with
    ``starting_position`` it is kept as data and replaces the titles.

    :param starting_position: value of the first cell of the title row
    :param source_df: raw data
    :return: a DataFrame holding the data rows under the title row, or True
        (after logging a warning) when no data row was collected
    """
    get_data_flag = False
    data_cols = None  # table titles
    data_list = []    # table content
    # iterrows() walks the rows exactly as `source_df.T.iteritems()` did;
    # DataFrame.iteritems no longer exists in pandas 2.x.
    for _, row in source_df.iterrows():
        values = row.values
        if get_data_flag:
            data_list.append(values)
        if values[0] == starting_position:
            get_data_flag = True
            data_cols = values

    if not data_list:
        Logging.warning('无数据!')
        return True
    return pd.DataFrame(data_list, columns=data_cols)
def operation_page(self):
    """Apply the report filters, then download the generated report file.

    Step 1 applies the name and date-range controls; on failure
    ``self.error`` is set to ``ErrorEnum.ERROR_3002`` and the method stops.
    Step 2 walks the pages of the download list looking for the file whose
    name contains ``RPA<current timestamp>``, clicks its link and waits for
    the download; on failure ``self.error`` is set to
    ``ErrorEnum.ERROR_3003``.
    """
    try:
        start_date, end_date = get_day_report_rule1()
        # Operate each filter control.
        self._operator_name_control()
        self._operator_time_control(start_date, end_date)
    except Exception as e:
        Logging.error(e)
        self.error = ErrorEnum.ERROR_3002
        # Do not go on to download a report whose filters were not applied.
        return

    try:
        base_url = 'https://subway.simba.taobao.com/#!/report/bpreport/download'
        self.web_driver.get(base_url)
        # Total number of pages. The element text is a string and must be
        # converted before it can drive range().
        page_num = int(self.web_driver.find_element_in_xpath(
            '//*[@id="brix_brick_291"]/div[2]/div[2]/span[2]').text)
        file_name = 'RPA' + date_to_string(get_current_timestamp(), '%Y%m%d%H%M%S')
        # XPath's function is `contains`; the original lookup used `contain`.
        cell_xpath = '//*[@id="brix_brick_334"]/tbody//td[contains(text(), "{}")]'.format(file_name)
        link_xpath = (
            '//*[@id="brix_brick_334"]/tbody//td[contains(text(), "{}")]/../td/a[contains(@class, "mr10")]'
            .format(file_name))
        for x in range(page_num):
            self.web_driver.get(base_url + '?page={}'.format(x))
            if self.web_driver.find_element_in_xpath(cell_xpath):
                self.web_driver.find_element_in_xpath(link_xpath).click()
                time.sleep(5)
                break
        self.wait_download_finish()
    except Exception as e:
        Logging.error(e)
        self.error = ErrorEnum.ERROR_3003
def wait_download_finish(self, file_type=None):
    """Wait for the downloaded file, move it to the process directory and read it.

    The download directory is polled once per second. A poll that sees a
    ``.crdownload`` / ``.tmp`` entry, or no matching file, counts against
    the timeout. A file matches when it is a regular file and, if
    ``page_data.rule_read_file_prefix`` is set, its name starts with that
    prefix. Exactly one match is moved (replacing any file of the same
    name), read into a DataFrame and appended to ``self.source_data_list``.

    :param file_type: ``'excel'`` or ``'csv'``; when None it is derived from
        the file name ending (``csv`` / ``xls`` / ``xlsx``)
    :return: True when a file was read, False when the timeout expired
    :raises Exception: when more than one file matches, or the file type
        cannot be determined
    """
    download_dir = self.FILE_DOWNLOAD_PATH
    prefix = self.page_data.rule_read_file_prefix
    # Download timeout: roughly 3 minutes, one tick per 1-second wait.
    timeout_num = 180
    while timeout_num >= 0:
        files = os.listdir(download_dir)
        # Entries with these markers are downloads still in progress.
        downloading = any('.crdownload' in f or '.tmp' in f for f in files)
        matched = []
        if not downloading:
            # The counter used to be reset for every file, so only the last
            # listed file was ever considered; with no prefix configured a
            # sub-directory also triggered `file.find(None)` -> TypeError.
            matched = [
                f for f in files
                if os.path.isfile(os.path.join(download_dir, f))
                and (prefix is None or f.startswith(prefix))
            ]
        if not matched:
            Time.sleep(1)
            timeout_num -= 1
            continue
        if len(matched) > 1:
            raise Exception('文件下载失败')

        file = matched[0]
        file_path = os.path.join(download_dir, file)
        self.file_names.append(file)

        # Move the file to the process directory.
        if self.page_data.rule_save_path_suffix is None:
            file_process_path = self.FILE_PROCESS_PATH
        else:
            path_suffix = self.page_data.rule_save_path_suffix
            # Substitute every data-dimension key found in the suffix rule.
            for key, value in self.data_dimension_dict.items():
                path_suffix = path_suffix.replace(key, value)
            file_process_path = self.FILE_PROCESS_PATH + '/' + path_suffix
        os.makedirs(file_process_path, exist_ok=True)
        remote_path = os.path.join(file_process_path, file)
        # TODO: rename an existing target to <timestamp>.<original name>
        #       instead of deleting it.
        if os.path.exists(remote_path):
            os.remove(remote_path)
        shutil.move(file_path, remote_path)
        Logging.info("move %s -> %s" % (file_path, remote_path))

        # Read the file.
        # TODO: archive extraction, several files, several sheets.
        # TODO: make the file type configurable and support more types.
        if file_type is None:
            if file.endswith('csv'):
                file_type = 'csv'
            elif file.endswith(('xls', 'xlsx')):
                file_type = 'excel'
        if file_type == 'excel':
            df = pd.read_excel(remote_path)
        elif file_type == 'csv':
            df = pd.read_csv(remote_path)
        else:
            Logging.error('解析文件类型,未找到!')
            raise Exception('解析文件类型,未找到!')
        self.source_data_list.append(df)
        return True
    return False
def insert_many(self, sql, data_list):
    """Log the statement and its parameter rows, then run it via ``executemany``.

    Nothing is committed here.

    :param sql: parameterised statement
    :param data_list: sequence of parameter tuples
    """
    Logging.info('db.insert_many sql:', sql, data_list)
    cursor = self.db_cur
    cursor.executemany(sql, data_list)
def delete(self, sql):
    """Log the statement, then run it on the cursor.

    Nothing is committed here.

    :param sql: complete statement to execute
    """
    Logging.info('db.delete sql:', sql)
    cursor = self.db_cur
    cursor.execute(sql)
def insert(self, sql, tuple_data):
    """Run one parameterised insert and return the generated key.

    :param sql: parameterised statement
    :param tuple_data: parameters for the statement
    :return: first field of the first row of ``select last_insert_id()``
    """
    Logging.info('db.insert sql:', sql, tuple_data)
    self.db_cur.execute(sql, tuple_data)
    rows = self.query('select last_insert_id() as id')
    return rows[0][0]
def execute(self, sql):
    """Log the statement, run it and commit.

    :param sql: complete statement to execute
    :return: whatever the cursor's ``execute`` returned
    """
    Logging.info('db.execute sql:', sql)
    outcome = self.db_cur.execute(sql)
    self.commit()
    return outcome
def query(self, sql):
    """Log the statement, run it and return every row.

    :param sql: complete statement to execute
    :return: the result of the cursor's ``fetchall``
    """
    Logging.info('db.query sql:', sql)
    cursor = self.db_cur
    cursor.execute(sql)
    return cursor.fetchall()
def operation_data_process(self):
    """Placeholder processing step: only logs ``self.data_list`` and a marker."""
    for message in (self.data_list, 'operation_data_process'):
        Logging.info(message)
def run(self, func, param=None):
    """Dispatch one step (``func``) to the wrapped object.

    Which steps are accepted depends on ``self.obj_name``:

    * TaskCreator: ``task_init``, ``task_added``, ``get_task`` and
      ``task_finish`` are called without arguments; ``task_set_start`` and
      ``task_set_end`` receive ``param``.
    * TaoLogin: ``run`` receives ``param``.
    * names starting with ``handle.website``: ``operation_page``,
      ``operation_data_process``, ``operation_data_input`` and
      ``operation_data_backup`` are called without arguments; an exception
      raised by the step is logged and turned into the step's error code.

    Any other combination sets ``ErrorEnum.ERROR_9002``.

    :param func: name of the step to run
    :param param: arguments for the step; defaults to a fresh empty dict
    :return: whatever the step returned, or None
    :raises Exception: when dispatching fails unexpectedly
    """
    if param is None:
        # A `{}` default would be one dict shared by every call and handed
        # to the step methods, which are free to modify it.
        param = {}
    results = None
    try:
        Logging.info(self.obj_name, func, param, ' 步骤执行 start!')
        is_creator = self.obj_name == 'handle.task_creator.TaskCreator'
        is_login = self.obj_name == 'handle.login.tb_login.TaoLogin'
        is_website = self.obj_name.find('handle.website') == 0
        # Error code recorded when the corresponding website step raises.
        website_step_errors = {
            'operation_page': ErrorEnum.ERROR_3000,
            'operation_data_process': ErrorEnum.ERROR_4000,
            'operation_data_input': ErrorEnum.ERROR_5000,
            'operation_data_backup': ErrorEnum.ERROR_6000,
        }

        if is_creator and func in ('task_init', 'task_added', 'get_task', 'task_finish'):
            results = getattr(self.obj, func)()
        elif is_creator and func in ('task_set_start', 'task_set_end'):
            results = getattr(self.obj, func)(param)
        elif is_login and func == 'run':
            results = self.obj.run(param)
        elif is_website and func in website_step_errors:
            try:
                results = getattr(self.obj, func)()
            except Exception as e:
                Logging.error(e)
                self.error = website_step_errors[func]
        else:
            self.error = ErrorEnum.ERROR_9002
            self.error.value.set_msg(
                '未匹配到任务func name:' + self.obj_name + ',func:' + func)

        if self.is_success():
            Logging.info(self.obj_name, func, param, ' 步骤执行成功 end!')
        else:
            Logging.info(self.obj_name, func, param, ' 步骤执行失败 error:', self.error, ' end!')
    except Exception as e:
        Logging.error(e)
        # Prefer the error recorded by the wrapped object; fall back to the
        # generic code only when nothing more specific is known.
        if self.is_success() and self.obj and self.obj.error:
            self.error = self.obj.error
        elif self.is_success():
            self.error = ErrorEnum.ERROR_9999
        raise Exception
    return results
def worker_task_run():
    """Worker loop: keep picking up jobs and crawl every page-data block of each.

    For each job the worker marks it as started, then for every page-data id
    builds the crawl task (logging in first when the store is not logged
    in) and runs the page steps in order. A failure inside the page steps is
    logged and the loop moves on to the next page-data id; a failure during
    initialisation or login marks the whole job as ``fail``. The loop ends
    when ``get_task`` returns no job id.
    """
    crawl_task_name = 'handle.website.subway.report.SubReportDay'
    # (step name, message used when the step fails), in execution order.
    # NOTE(review): 'operation_page_download' has no dispatch branch in the
    # visible TaskController.run - confirm this step can ever succeed.
    page_steps = (
        ('operation_page', '取数-页面操作失败!'),
        ('operation_page_download', '取数-页面文件下载及读取失败!'),
        ('operation_data_process', '取数-数据处理失败!'),
        ('operation_data_input', '取数-数据入库失败!'),
        ('operation_data_backup', '取数-数据备份失败!'),
    )

    tc = TaskController('handle.task_creator.TaskCreator')
    job_id, store_id, page_data_ids = tc.run('get_task')
    while job_id:
        if not tc.run('task_set_start', {'job_id': job_id}):
            # The job could not be claimed; go and fetch another one.
            Logging.info('job:', job_id, store_id, page_data_ids, ' 任务领取慢了一拍,继续获取其他任务!')
        else:
            try:
                port = None
                for page_data_id in page_data_ids:
                    # step1: initialise the crawl task
                    param = {
                        'store_id': store_id,
                        'page_data_id': page_data_id,
                        'port': port,
                        'job_id': job_id
                    }
                    task = TaskController(crawl_task_name, param)
                    if not task.obj.login_flag:
                        # step2: the store is not logged in - log in first
                        # NOTE(review): `task.store` / `login_tc.port` are not
                        # set by the visible TaskController.__init__ - verify.
                        login_tc = TaskController('handle.login.tb_login.TaoLogin', task.store)
                        login_tc.run('run')
                        if not login_tc.is_success():
                            Logging.error('param:', param, '登录失败!')
                            raise Exception('param:', param, '登录失败!')
                        port = login_tc.port
                        param['port'] = port
                        task = TaskController(crawl_task_name, param)
                    if not task.is_success():
                        Logging.error('param:', param, '任务初始化失败!')
                        raise Exception('param:', param, '任务初始化失败!')
                    try:
                        # step3..step7: page operation, download and read,
                        # processing, loading, backup
                        for step, failure_message in page_steps:
                            task.run(step)
                            if not task.is_success():
                                Logging.error('param:', param, failure_message)
                                raise Exception('param:', param, failure_message)
                    except Exception as e:
                        Logging.error(e)
                        Logging.error('param:', param, ' 页面取数过程失败!')
                tc.run('task_set_end', {'job_id': job_id, 'result': 'success'})
            except Exception as e:
                Logging.error(e)
                Logging.error('job_id:', job_id, ' 任务执行失败!')
                tc.run('task_set_end', {'job_id': job_id, 'result': 'fail'})
        # Fetch the next job.
        job_id, store_id, page_data_ids = tc.run('get_task')