def etl_fact_macro_details(source_engine, target_engine):
    """Main ETL entry point for the fact_macro_details table.

    Moves data from the tag_detail table on "235" into the
    fact_macro_details table on "240" (labels kept from the original
    docstring; presumably host identifiers -- TODO confirm), one id range
    per round.

    :param source_engine: source database engine
    :param target_engine: target database engine
    """
    extractor = Extract(source_engine, target_engine)
    transformer = Transform()
    loader = Load(target_engine)
    progress = Record(table='fact_macro_detail', record_path='rec.cfg')

    start_params = progress.get_record()
    divisions = extractor.std_divisions()

    for round_no in range(start_params['rounds']):
        # Id bounds of this round, counted from the recorded update_id.
        lower_id = (start_params['update_id']
                    + round_no * start_params['chunksize'] + 1)
        upper_id = (start_params['update_id']
                    + (round_no + 1) * start_params['chunksize'] + 1)

        chunk = extractor.tag_details(lower_id, upper_id)
        if len(chunk) == 0:
            # Nothing in this id range: no load and no record update.
            continue

        loader.loading(transformer.compile_datasets(chunk, divisions))

        # Record the highest id of the chunk just loaded; a falsy maximum
        # falls back to the update_id held in start_params.
        if chunk['id'].max():
            update_id = chunk['id'].max()
        else:
            update_id = start_params['update_id']
        progress.update_record(update_id)
def etl_dimension_time(target_engine):
    """Main ETL entry point for the time dimension table.

    Generates the full time range, turns it into the date table and loads
    the result through ``Load``.

    :param target_engine: target database engine
    """
    extractor = Extract()
    transformer = Transform()
    loader = Load(target_engine)

    # extract -> transform -> load, evaluated innermost call first.
    loader.loading(transformer.gen_date(extractor.gen_full_time()))
def etl_fact_market(*args):
    """Main ETL entry point for the fact_market table.

    Iterates over the tag-count rows, skips markets whose id is invalid,
    already present in the target, or already handled in this run, and
    loads one combined record per remaining market. A failure while
    loading a single market is logged and does not stop the run.

    :param args: exactly three positional engines, in this order:
        engine_zone_macro, engine_draw, engine_target
    :raises TypeError: if the number of positional arguments is not three
    """
    # The engines arrive positionally; bind them explicitly instead of
    # relying on same-named names existing outside the function.
    if len(args) != 3:
        raise TypeError(
            'etl_fact_market() expects 3 positional arguments '
            '(engine_zone_macro, engine_draw, engine_target), got %d'
            % len(args))
    engine_zone_macro, engine_draw, engine_target = args

    # Initialise the extract, transform and load objects.
    extract = Extract(engine_zone_macro, engine_draw, engine_target)
    transform = Transform()
    load = Load(engine_target)

    # Markets that have already been through this ETL.
    done_market = extract.done_market()
    df_tag_counts = extract.tag_counts()
    df_industry = extract.industry()

    # Ids handled during this run; a set keeps the membership test cheap.
    handled = set()

    for i, sample_tag_counts in df_tag_counts.iterrows():
        grand_parent_id = sample_tag_counts['grandParentId']

        # Validate the id (expected length is 36).
        if len(grand_parent_id) != 36:
            logging.warning('Round %d, %s is invalid ,skipped.'
                            % (i, grand_parent_id))
            continue
        # Skip markets already loaded by a previous run.
        if grand_parent_id in done_market:
            logging.warning('Round %d, %s etl before' % (i, grand_parent_id))
            continue
        # Skip markets already seen earlier in this run.
        if grand_parent_id in handled:
            logging.warning('Round %d, %s etl before' % (i, grand_parent_id))
            continue
        handled.add(grand_parent_id)

        # Extract.
        zone_grandparent = extract.zone_grandparent(grand_parent_id)
        if len(zone_grandparent) == 0:
            logging.warning('Round %d, has no draw samples' % i)
            continue
        rent = extract.rent_details(grand_parent_id)
        industry_tmp = df_industry[
            df_industry['grandParentId'] == grand_parent_id]

        # Transform.
        rent = transform.rent_calculate(rent)
        industry_dict = transform.reshape_industry(industry_tmp)

        # Combine everything into the record to load.
        clean = transform.compile_dfs(sample_tag_counts, rent,
                                      industry_dict, zone_grandparent)

        # Best effort per market: log the failure and move on.
        try:
            load.loading(clean)
            logging.info('Round %d, %s etl secceed' % (i, grand_parent_id))
        except Exception as e:
            logging.error('Round %d, %s' % (i, e))
def etl_demension_division(target_engine):
    """Main ETL entry point for the division table.

    Per the original description, the data comes from the standard CSV
    tables crawled from the statistics bureau and is loaded into the data
    warehouse.

    :param target_engine: target database engine
    """
    extractor = Extract()
    transformer = Transform()
    loader = Load(target_engine)
    logging.info('Initialize three instances')

    raw_divisions = extractor.std_divisions()
    loader.loading(transformer.std_districts(raw_divisions))
def etl_fact_market(source_engine, target_engine, rec_path):
    """ETL entry point for fact_market (source/target engine variant).

    NOTE(review): this looks unfinished -- confirm before relying on it:
      * ``unique_marketguid`` is an empty list, so the market loop never
        executes;
      * ``rec_path`` is not used, the record file name is hard-coded;
      * the loop body reads ``industry`` and ``sample_tag_counts``, which
        are not assigned anywhere in this function.

    :param source_engine: engine passed to ``Extract`` as the source
    :param target_engine: engine passed to ``Extract`` and ``Load``
    :param rec_path: accepted but currently unused
    """
    extractor = Extract(source_engine, target_engine)
    transformer = Transform()
    loader = Load(target_engine)
    record = Record('rec.cfg')
    start_params = record.get_record()  # result is not used below

    unique_marketguid = []
    done_market = []
    has_dealed = []

    for round_no, market_id in enumerate(unique_marketguid):
        # Validate the id (expected length is 36).
        if len(market_id) != 36:
            logging.error('Round %d, %s is not valid.'
                          % (round_no, market_id))
            continue
        # Skip markets that already went through this ETL.
        if market_id in done_market:
            logging.warning('Round %d, %s etl before'
                            % (round_no, market_id))
            continue
        # Skip markets already seen earlier in this run.
        if market_id in has_dealed:
            logging.warning('Round %d, %s etl before'
                            % (round_no, market_id))
            continue
        has_dealed.append(market_id)

        zone_grandparent = extractor.zone_grandparent(market_id)
        if len(zone_grandparent) == 0:
            logging.warning('Round %d, has no draw samples' % round_no)
            continue
        rent = extractor.rent_details(market_id)
        industry_tmp = industry[industry['grandParentId'] == market_id]

        # Transform.
        rent = transformer.rent_calculate(rent)
        industry_dict = transformer.reshape_industry(industry_tmp)

        # Combine everything into the record to load.
        clean = transformer.compile_dfs(sample_tag_counts, rent,
                                        industry_dict, zone_grandparent)

        # A failed load is logged and the loop continues.
        try:
            loader.loading(clean)
        except Exception as e:
            logging.error('Round %d, %s' % (round_no, e))
def market_to_api2(source, target, record_file='api2.record'):
    """Main ETL entry point for the api2 table of the anti_fraud database.

    :param source: source database engine
    :param target: target database engine
    :param record_file: name of the file that records the loaded ids;
        defaults to 'api2.record'
    """
    # Set up the three stages.
    extractor = Extract(source, target, record_file)
    transformer = Transform()
    loader = Load(target, record_file)

    # Extract.
    markets = extractor.market()
    samples = extractor.draw_samples()

    # Transform: reshape the markets, aggregate the samples, then combine.
    markets_reshaped = transformer.reshape_market(markets)
    samples_aggregated = transformer.aggregate_from_samples(samples)
    combined = transformer.compile_dfs(markets_reshaped, samples_aggregated)

    # Load.
    loader.loading(combined)