Example #1
0
def main():
    """Generate the DSSM sample file for one date from click-action files.

    The date string is taken from the single optional command-line argument;
    with no argument, ``TimeUtil.get_yesterday_str()`` is used. Any other
    argument count logs an error and exits with status 1.

    Steps:
      1. Read sampling parameters and the action/sample directories from
         ``DSSMConfigUtil``. An empty directory setting falls back to a
         ``data/rec-recall/dssm/...`` path four levels above this file.
      2. Resolve the click files for the date and exit with status 1 if any
         of them is missing.
      3. Merge and filter the actions, split the filtered actions into
         blocks, and submit one ``deal_block`` task per block to a process
         pool.

    NOTE(review): ``deal_block`` receives the sample file path and the file
    is removed before the pool starts, so the workers presumably append to
    it -- confirm against ``deal_block``.
    """

    def _default_data_dir(name):
        # Fallback location, only evaluated when the configured path is empty.
        job_path = os.path.abspath(
            os.path.join(os.path.dirname(__file__), "../../../../"))
        return os.path.join(job_path, "data/rec-recall/dssm", name)

    def _log_block_failure(exc):
        # Without an error callback (or a .get() on the AsyncResult), an
        # exception raised inside a worker is dropped without any trace.
        logger.error("deal_block failed:{!r}".format(exc))

    if len(sys.argv) == 1:
        dt = TimeUtil.get_yesterday_str()
    elif len(sys.argv) == 2:
        dt = sys.argv[1]
    else:
        logger.error("parameter error")
        sys.exit(1)

    recent_items = DSSMConfigUtil.sample_conf.get_int("recent-items")
    cpu_threshold = DSSMConfigUtil.sample_conf.get_float("cpu-threshold")
    window = DSSMConfigUtil.sample_conf.get_int("window")
    J = DSSMConfigUtil.sample_conf.get_int("J")

    action_path = DSSMConfigUtil.action_conf.get_string("action-path")
    if action_path == "":
        action_path = _default_data_dir("action")
    click_path = os.path.join(action_path, "click")

    click_files = click_action_files(click_path, dt, 1)
    logger.info("click_files:{}".format(click_files))

    sample_path = DSSMConfigUtil.sample_conf.get_string("sample-path")
    if sample_path == "":
        sample_path = _default_data_dir("sample")

    os.makedirs(sample_path, exist_ok=True)
    logger.info("sample_path:{}".format(sample_path))

    if not FileUtil.files_exists(click_files):
        logger.info("there are click files that do not exist")
        sys.exit(1)

    # Single definition of the output path; it is removed, handed to every
    # worker, and logged at the end.
    sample_file = os.path.join(sample_path, "sample_{}.data".format(dt))

    ma = merge_action(click_files)
    fa = filter_action(ma, window + 1, recent_items)
    logger.info("len(ma):{}".format(len(ma)))
    logger.info("len(fa):{}".format(len(fa)))

    items = set_items(ma)
    logger.info("len(items):{}".format(len(items)))

    total_number = len(fa)
    # Pool() rejects a process count below 1, which a non-positive
    # cpu-threshold would otherwise produce.
    block_number = max(1, math.ceil(CpuUtil.cpu_count() * cpu_threshold))
    block_size = FileUtil.block_size(total_number, block_number)

    logger.info("total_number:{}".format(total_number))
    logger.info("block_number:{}".format(block_number))
    logger.info("block_size:{}".format(block_size))

    blocks = split_merge_action(fa, block_size)

    logger.info("len(blocks):{}".format(len(blocks)))

    # Drop any output left over from a previous run for the same date.
    FileUtil.remove(sample_file)

    # Release the names for the full merged/filtered actions before the
    # worker processes are started.
    del ma, fa
    gc.collect()

    pool = Pool(block_number)
    try:
        for block in blocks:
            pool.apply_async(
                deal_block,
                args=(block, items, window, J, sample_file),
                error_callback=_log_block_failure,
            )
    finally:
        # Always stop accepting work and wait for submitted tasks, even if
        # submission itself raised.
        pool.close()
        pool.join()

    logger.info("sample_file:{}".format(sample_file))