def main():
    """Generate the sample file ``sample_<dt>.data`` for one day of click actions.

    The date string ``dt`` is taken from the single optional command-line
    argument; with no argument ``TimeUtil.get_yesterday_str()`` is used. Any
    other argument count logs an error and exits with status 1.

    Steps:
      1. Read ``recent-items``, ``cpu-threshold``, ``window`` and ``J`` from
         the sample config, and the action/sample directories from config
         (falling back to ``data/rec-recall/dssm/...`` under the directory
         four levels above this file's directory when the setting is empty).
      2. Resolve the click files for ``dt`` and exit with status 1 if any of
         them is missing.
      3. Merge and filter the actions, split the filtered actions into
         blocks and hand each block to ``deal_block`` in a process pool.
      4. Exit with status 1 if any worker raised; otherwise log the path of
         the sample file.
    """
    if len(sys.argv) == 1:
        dt = TimeUtil.get_yesterday_str()
    elif len(sys.argv) == 2:
        dt = sys.argv[1]
    else:
        logger.error("parameter error")
        sys.exit(1)

    recent_items = DSSMConfigUtil.sample_conf.get_int("recent-items")
    cpu_threshold = DSSMConfigUtil.sample_conf.get_float("cpu-threshold")
    window = DSSMConfigUtil.sample_conf.get_int("window")
    J = DSSMConfigUtil.sample_conf.get_int("J")

    # Fallback root used when a path setting is empty in the config.
    job_path = os.path.abspath(
        os.path.join(os.path.dirname(__file__), "../../../../")
    )

    action_path = DSSMConfigUtil.action_conf.get_string("action-path")
    if action_path == "":
        action_path = os.path.join(job_path, "data/rec-recall/dssm/action")
    click_path = os.path.join(action_path, "click")
    click_files = click_action_files(click_path, dt, 1)
    logger.info("click_files:{}".format(click_files))

    sample_path = DSSMConfigUtil.sample_conf.get_string("sample-path")
    if sample_path == "":
        sample_path = os.path.join(job_path, "data/rec-recall/dssm/sample")
    os.makedirs(sample_path, exist_ok=True)
    logger.info("sample_path:{}".format(sample_path))

    if not FileUtil.files_exists(click_files):
        # Fatal condition: log at error level like the other abort path.
        logger.error("there are click files that do not exist")
        sys.exit(1)

    ma = merge_action(click_files)
    fa = filter_action(ma, window + 1, recent_items)
    logger.info("len(ma):{}".format(len(ma)))
    logger.info("len(fa):{}".format(len(fa)))

    items = set_items(ma)
    logger.info("len(items):{}".format(len(items)))

    total_number = len(fa)
    # The pool size is the CPU count scaled by the configured threshold.
    block_number = math.ceil(CpuUtil.cpu_count() * cpu_threshold)
    block_size = FileUtil.block_size(total_number, block_number)
    logger.info("total_number:{}".format(total_number))
    logger.info("block_number:{}".format(block_number))
    logger.info("block_size:{}".format(block_size))

    blocks = split_merge_action(fa, block_size)
    logger.info("len(blocks):{}".format(len(blocks)))

    # Single definition of the output path; it is removed up front
    # (presumably because deal_block appends to it -- TODO confirm).
    sample_file = os.path.join(sample_path, "sample_{}.data".format(dt))
    FileUtil.remove(sample_file)

    # Drop the large intermediates before the worker processes are created.
    del ma, fa
    gc.collect()

    pool = Pool(block_number)
    results = [
        pool.apply_async(deal_block, args=(block, items, window, J, sample_file))
        for block in blocks
    ]
    pool.close()
    pool.join()

    # apply_async stores worker exceptions on the AsyncResult; without get()
    # they would be dropped and a partial sample file reported as success.
    failed = 0
    for result in results:
        try:
            result.get()
        except Exception as exc:
            failed += 1
            logger.error("deal_block error:{}".format(exc))
    if failed:
        logger.error("failed blocks:{}".format(failed))
        sys.exit(1)

    logger.info("sample_file:{}".format(sample_file))