Ejemplo n.º 1
0
def etl_fact_macro_details(source_engine, target_engine):
    """Run the ETL job for the fact_macro_details table.

    Moves data from the ``tag_detail`` table (235) into the
    ``fact_macro_details`` table (240). Rows are pulled in id windows whose
    start, size and count come from the stored record; after each loaded
    window the largest ``id`` seen is written back to the record.

    :param source_engine: source database engine
    :param target_engine: target database engine
    """
    extract = Extract(source_engine, target_engine)
    transform = Transform()
    load = Load(target_engine)
    record = Record(table='fact_macro_detail', record_path='rec.cfg')

    start_params = record.get_record()
    divisions = extract.std_divisions()

    # These values do not change between rounds, so read them once.
    base_id = start_params['update_id']
    chunksize = start_params['chunksize']

    for i in range(start_params['rounds']):
        # Window i covers the ids right after the recorded checkpoint.
        start_id = base_id + i * chunksize + 1
        end_id = start_id + chunksize

        tag_details = extract.tag_details(start_id, end_id)
        if len(tag_details) == 0:
            # Nothing in this id window; try the next one.
            continue

        macro_details = transform.compile_datasets(tag_details, divisions)
        load.loading(macro_details)

        # Compute the maximum once; fall back to the starting checkpoint
        # when the maximum is falsy.
        max_id = tag_details['id'].max()
        record.update_record(max_id if max_id else base_id)
Ejemplo n.º 2
0
def etl_dimension_time(target_engine):
    """Build the time dimension table and load it into the target database.

    :param target_engine: target database engine
    """
    extractor = Extract()
    transformer = Transform()
    loader = Load(target_engine)

    # Generate the full time range, derive the date table from it, load it.
    time_table = transformer.gen_date(extractor.gen_full_time())
    loader.loading(time_table)
Ejemplo n.º 3
0
def etl_fact_market(*args):
    """Run the ETL job for the fact_market table.

    Iterates over the tag-count rows, skips markets whose ``grandParentId``
    is invalid, already loaded, or already handled in this run, then
    extracts, transforms and loads the data for each remaining market.
    A failure while loading one market is logged and does not stop the run.

    :param args: exactly three positional engines, in this order:
        engine_zone_macro, engine_draw, engine_target
    :raises TypeError: if the number of positional arguments is not three
    """
    # The engines were previously read as bare names that were never bound
    # from ``args``; unpack them explicitly and fail early on a bad call.
    if len(args) != 3:
        raise TypeError(
            'etl_fact_market() expects 3 positional arguments '
            '(engine_zone_macro, engine_draw, engine_target), got %d'
            % len(args))
    engine_zone_macro, engine_draw, engine_target = args

    # Initialise the extract, transform and load objects.
    extract = Extract(engine_zone_macro, engine_draw, engine_target)
    transform = Transform()
    load = Load(engine_target)

    # Fetch the markets that have already been through ETL.
    done_market = extract.done_market()
    df_tag_counts = extract.tag_counts()
    df_industry = extract.industry()
    # Ids handled during this run; a set keeps the membership test O(1).
    handled = set()

    for i, sample_tag_counts in df_tag_counts.iterrows():
        grandParentId = sample_tag_counts['grandParentId']

        # Validity check: only ids of length 36 are accepted
        # (presumably a UUID string -- confirm against the source schema).
        if len(grandParentId) != 36:
            logging.warning('Round %d, %s is invalid ,skipped.',
                            i, grandParentId)
            continue

        # Skip markets loaded by an earlier run or earlier in this run.
        if grandParentId in done_market or grandParentId in handled:
            logging.warning('Round %d, %s etl before', i, grandParentId)
            continue
        handled.add(grandParentId)

        # Extract.
        zone_grandparent = extract.zone_grandparent(grandParentId)
        if len(zone_grandparent) == 0:
            logging.warning('Round %d, has no draw samples', i)
            continue
        rent = extract.rent_details(grandParentId)
        industry_tmp = df_industry[df_industry['grandParentId'] ==
                                   grandParentId]

        # Transform.
        rent = transform.rent_calculate(rent)
        industry_dict = transform.reshape_industry(industry_tmp)

        # Combine.
        clean = transform.compile_dfs(sample_tag_counts, rent, industry_dict,
                                      zone_grandparent)

        # Best effort: one failed load must not abort the remaining markets.
        try:
            load.loading(clean)
        except Exception as e:
            logging.error('Round %d, %s', i, e)
        else:
            logging.info('Round %d, %s etl secceed', i, grandParentId)
Ejemplo n.º 4
0
def etl_demension_division(target_engine):
    """Run the ETL job for the division table.

    Extracts data from the standard CSV tables crawled from the statistics
    bureau and loads it into the data warehouse.

    :param target_engine: target database engine
    """
    extractor, transformer, loader = Extract(), Transform(), Load(target_engine)
    logging.info('Initialize three instances')

    # Extract the raw divisions, standardise the districts, then load them.
    districts = transformer.std_districts(extractor.std_divisions())
    loader.loading(districts)
Ejemplo n.º 5
0
def etl_fact_market(source_engine, target_engine, rec_path):
    """Run the ETL job for the fact_market table, using a record file.

    NOTE(review): this function looks unfinished. ``unique_marketguid`` and
    ``done_market`` start out empty, so the loop body never runs, and that
    body refers to ``industry`` and ``sample_tag_counts``, which are not
    defined inside this function. Those inputs must be wired up before the
    loop can do any work.

    :param source_engine: source database engine
    :param target_engine: target database engine
    :param rec_path: path of the record file handed to ``Record``
    """
    extract = Extract(source_engine, target_engine)
    transform = Transform()
    load = Load(target_engine)
    # Honour the caller-supplied path; it used to be ignored in favour of a
    # hard-coded 'rec.cfg'.
    record = Record(rec_path)

    # NOTE(review): read but not used anywhere below yet.
    start_params = record.get_record()
    unique_marketguid = []
    done_market = []
    # Ids handled during this run; a set keeps the membership test O(1).
    handled = set()

    for i, grandParentId in enumerate(unique_marketguid):

        # Validity check: only ids of length 36 are accepted.
        if len(grandParentId) != 36:
            logging.error('Round %d, %s is not valid.', i, grandParentId)
            continue

        # Skip markets loaded by an earlier run or earlier in this run.
        if grandParentId in done_market or grandParentId in handled:
            logging.warning('Round %d, %s etl before', i, grandParentId)
            continue
        handled.add(grandParentId)

        zone_grandparent = extract.zone_grandparent(grandParentId)
        if len(zone_grandparent) == 0:
            logging.warning('Round %d, has no draw samples', i)
            continue

        rent = extract.rent_details(grandParentId)
        # NOTE(review): ``industry`` is not defined in this function.
        industry_tmp = industry[industry['grandParentId'] == grandParentId]

        # Transform.
        rent = transform.rent_calculate(rent)
        industry_dict = transform.reshape_industry(industry_tmp)

        # Combine.
        # NOTE(review): ``sample_tag_counts`` is not defined in this function.
        clean = transform.compile_dfs(sample_tag_counts, rent, industry_dict,
                                      zone_grandparent)

        # Best effort: one failed load must not abort the remaining markets.
        try:
            load.loading(clean)
        except Exception as e:
            logging.error('Round %d, %s', i, e)
Ejemplo n.º 6
0
def market_to_api2(source, target, record_file='api2.record'):
    """Run the ETL job for the api2 table of the anti_fraud database.

    :param source: source database engine
    :param target: target database engine
    :param record_file: name of the file that records the loaded ids;
        defaults to ``api2.record``
    """
    # Set up the three pipeline stages.
    extractor = Extract(source, target, record_file)
    transformer = Transform()
    loader = Load(target, record_file)

    # Extract.
    markets = extractor.market()
    samples = extractor.draw_samples()

    # Transform: reshape the markets, aggregate the samples, combine both.
    combined = transformer.compile_dfs(
        transformer.reshape_market(markets),
        transformer.aggregate_from_samples(samples),
    )

    # Load.
    loader.loading(combined)