import argparse
import os
import sys
import time

from os.path import dirname, join as path_join
from pyhocon import ConfigFactory  # assumed provider of ConfigFactory (HOCON configs)

# Make the project-local 'dstools' package importable before importing it.
module_path = os.path.realpath(__file__)
root_dir = dirname(dirname(module_path))
sys.path.append(path_join(root_dir, 'dstools'))

import spark.core as spark_utils

# The only required argument is the config file; any other command-line
# tokens are collected and treated as HOCON overrides (e.g. "key=value").
parser = argparse.ArgumentParser()
parser.add_argument('--conf', required=True)
args, overrides = parser.parse_known_args()

# Merge command-line overrides on top of the file config; override values win.
file_conf = ConfigFactory.parse_file(args.conf, resolve=False)
overrides = ','.join(overrides)
over_conf = ConfigFactory.parse_string(overrides)
conf = over_conf.with_fallback(file_conf)

sc, sqc = spark_utils.init_session(conf['spark'],
                                   app=os.path.basename(args.conf),
                                   return_context=True)

# Ship the user-supplied pipeline module (resolved relative to the config
# file) to the Spark executors.
pipeline_file = conf.get('pipeline-file', None)
if pipeline_file is not None:
    pipeline_full_path = os.path.join(
        os.path.dirname(os.path.realpath(args.conf)), pipeline_file)
    sc.addPyFile(pipeline_full_path)

# Load the source data, drop rows without an identifier and normalize types.
print('{} loading data...'.format(time.strftime('%Y-%m-%d %H:%M:%S')))
sdf = spark_utils.define_data_frame(conf['source'], sqc)
sdf = sdf.filter('uid is not null')
sdf = sdf.withColumn('uid', sdf.uid.astype('string'))
sdf = spark_utils.pandify(sdf)
import argparse
import os
import sys
import time

from os.path import dirname, join as path_join
from pyhocon import ConfigFactory  # assumed provider of ConfigFactory (HOCON configs)

# Make the project-local 'dstools' package importable before importing it.
module_path = os.path.realpath(__file__)
root_dir = dirname(dirname(module_path))
sys.path.append(path_join(root_dir, 'dstools'))

import spark.core as spark_utils

# The only required argument is the config file; any other command-line
# tokens are collected and treated as HOCON overrides (e.g. "key=value").
parser = argparse.ArgumentParser()
parser.add_argument('--conf', required=True)
args, overrides = parser.parse_known_args()

# Merge command-line overrides on top of the file config; override values win.
file_conf = ConfigFactory.parse_file(args.conf, resolve=False)
overrides = ','.join(overrides)
over_conf = ConfigFactory.parse_string(overrides)
conf = over_conf.with_fallback(file_conf)

sqc = spark_utils.init_session(conf['spark'], app=os.path.basename(args.conf))

# Load the scored data and drop rows without an identifier.
sdf = spark_utils.define_data_frame(conf['source'], sqc)
sdf = sdf.filter('uid is not null')

# Keep only the top fraction of rows by predicted probability (10% by default).
row_count = sdf.count()
top_percent = float(conf.get('top-size', '.1'))
top_size = int(row_count * top_percent)

sdf = sdf.orderBy(sdf.target_proba.desc())
df = spark_utils.limit(sdf, top_size).toPandas()
df['current_dt'] = time.strftime('%Y-%m-%dT%H:%M:%S%z')
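Both scripts rely on the same configuration trick: leftover command-line tokens are joined into a HOCON string and layered over the config file, so a run can be tweaked without editing the file. Below is a minimal standalone sketch of that merge using pyhocon directly; the keys and values here are illustrative assumptions, not taken from the original configs.

from pyhocon import ConfigFactory

# File config as it might look on disk (illustrative keys only).
file_conf = ConfigFactory.parse_string("""
    top-size = 0.1
    source { table = "scores" }
""")

# Extra CLI tokens such as "top-size=0.05" get joined and parsed as HOCON.
over_conf = ConfigFactory.parse_string("top-size=0.05")

# with_fallback keeps the override values and falls back to the file for
# everything else.
conf = over_conf.with_fallback(file_conf)

print(conf['top-size'])          # 0.05 -- CLI override takes precedence
print(conf['source']['table'])   # "scores" -- untouched file value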