def get_new_data(config_dict):
    """Collect the detail records for the data IDs that were added today.

    The planned amount of work is the number of IDs read from
    ``add_data.json`` inside ``config_dict["add_folder_name"]``. The amount
    already done is the count reported by ``file_utils.data_info_count`` for
    the files found in ``config_dict["data_info_save_folder_name"]``. When the
    two counts are equal nothing is fetched; otherwise ``get_data_info`` is
    invoked with the pending IDs.

    :param config_dict: run configuration; must contain the keys
        ``"add_folder_name"`` and ``"data_info_save_folder_name"``. It is also
        passed through unchanged to ``get_data_info``.
    :return: ``1`` when collection was already complete or finished without
        raising, ``-1`` when ``get_data_info`` raised an exception.
    """
    curr_date = file_utils.get_curr_date()

    # Planned work: IDs listed in today's "added" file.
    add_filename = config_dict["add_folder_name"] + "add_data.json"
    new_data_ids = file_utils.get_add_data_id(add_filename)
    add_data_count = new_data_ids.qsize()

    # Completed work: records already stored in the save folder.
    data_info_save_folder_name = config_dict["data_info_save_folder_name"]
    file_list = file_utils.get_file_list(data_info_save_folder_name)
    data_info_count = file_utils.data_info_count(file_list)

    # Skip the collection entirely when everything has already been fetched.
    if add_data_count == data_info_count:
        logging.info("采集日期=%s,新增数据采集已经完成!" % (curr_date))
        return 1

    try:
        logging.info("开始采集今日[%s]新增数据" % (curr_date))
        get_data_info(new_data_ids, config_dict)
        return 1
    except Exception:
        # Keep the -1 contract for callers, but record the traceback instead
        # of discarding it. Catching Exception rather than BaseException lets
        # KeyboardInterrupt / SystemExit propagate so the run can be stopped.
        logging.exception("采集今日[%s]新增数据失败" % (curr_date))
        return -1
import check_data # 读取基础配置文件 cf = ConfigParser.ConfigParser() cf.read("../etc/base_config.cfg") #日志文件 LOG_NAME = "data_collection.log" if __name__ == "__main__": # 运行程序基础参数 config_filename = cf.get("default_config", "config_filename") log_name = cf.get("default_config", "log_name") get_type = cf.get("base_config", "get_type") # 该参数暂时未生效,未来可能需要实现方式 data_type = cf.get("base_config", "data_type") root_path = cf.get("base_config", "root_path") curr_date = file_utils.get_curr_date() curr_root_path = config.get_curr_root_path(root_path, data_type, curr_date) tips = "===============================\n" \ "程序运行前先对base_config.ini进行配置:\n" \ "https://github.com/xiaodeme \n" \ "运行日志请查看 logs/data_collection.log \n" \ "===============================" print tips # 日志初始化配置 log_foloder_name = curr_root_path + "/logs/" file_utils.mkdir_path(log_foloder_name) log_utils.log_config(log_foloder_name + LOG_NAME) #1. 初始化程序运行配置基础信息 config.init_config(root_path, data_type, get_type)
def _load_snapshot_ids(root_path, data_type, days_back):
    """Return ``(date, id_list)`` for the snapshot taken ``days_back`` days ago."""
    snapshot_date = file_utils.get_last_date(days_back)
    snapshot_root = config.get_last_root_path(root_path, data_type, snapshot_date)
    file_list = file_utils.get_file_list(snapshot_root + "/data_list/")
    id_queue = file_utils.get_data_info_id(file_list)
    return snapshot_date, list(id_queue.queue)


def data_process(config_dict, max_lookback_days=365):
    """Diff today's collected IDs against the most recent earlier snapshot.

    IDs present today but not in the earlier snapshot are written to the
    "add" file; IDs present in the earlier snapshot but missing today are
    written to the "reduce" file. Both lists are stored as ``str(list)``.
    If the "add" file already exists the analysis is considered done and the
    function returns without touching anything.

    The earlier snapshot is searched backwards one day at a time, starting
    with yesterday, until a day with at least one ID is found. The search is
    capped at ``max_lookback_days`` so that a first run with no history
    cannot loop forever; if the cap is reached the baseline is empty, which
    means every ID collected today is reported as added.

    :param config_dict: run configuration; must contain the keys
        ``"add_folder_name"``, ``"reduct_folder_name"`` and
        ``"data_list_folder_name"``.
    :param max_lookback_days: maximum number of days to search backwards for
        a non-empty earlier snapshot (default 365).
    :return: None
    """
    add_filename = config_dict["add_folder_name"] + ADD_DATA_FILENAME
    reduce_filename = config_dict["reduct_folder_name"] + REDUCE_DATA_FILENAME
    data_list_folder_name = config_dict["data_list_folder_name"]

    # The add file doubles as the "analysis already done" marker.
    if os.path.exists(add_filename):
        logging.info("数据分析:本次新增减少[add/reduct]文件已经存储:%s" % (add_filename))
        return

    data_type = cf.get("base_config", "data_type")
    root_path = cf.get("base_config", "root_path")

    # A previous day's collection may have failed, so keep stepping back
    # until a snapshot with data is found or the look-back cap is reached.
    days_back = 1
    last_date, previous_ids = _load_snapshot_ids(root_path, data_type, days_back)
    while not previous_ids and days_back < max_lookback_days:
        logging.warning("数据分析:[%s]数据采集数量: %s" % (last_date, len(previous_ids)))
        days_back += 1
        last_date, previous_ids = _load_snapshot_ids(root_path, data_type, days_back)

    if not previous_ids:
        # No history within the cap: continue with an empty baseline instead
        # of searching forever.
        logging.warning("数据分析:向前追溯%s天均未找到历史数据,按空基线处理" % (days_back))

    logging.info("数据分析:[%s]数据采集数量: %s" % (last_date, len(previous_ids)))

    # IDs collected in the current run.
    file_list = file_utils.get_file_list(data_list_folder_name)
    current_ids = list(file_utils.get_data_info_id(file_list).queue)
    logging.info("数据分析:今天[%s]数据采集数量: %s" % (file_utils.get_curr_date(), len(current_ids)))

    # Build each set once; both differences below reuse them.
    previous_set = set(previous_ids)
    current_set = set(current_ids)

    # Added: present today, absent from the earlier snapshot.
    add_data = list(current_set.difference(previous_set))
    file_utils.write_file(add_filename, str(add_data))
    logging.info("数据分析:本次新增数据:%s" % (len(add_data)))
    if len(add_data) > 0:
        logging.info("本次新增数据标识已经保存")

    # Reduced: present in the earlier snapshot, absent today.
    reduce_data = list(previous_set.difference(current_set))
    file_utils.write_file(reduce_filename, str(reduce_data))
    logging.info("数据分析:本次减少数据:%s" % (len(reduce_data)))
    if len(reduce_data) > 0:
        logging.info("本次减少数据标识已经保存")