Example #1
import argparse
import os
import sys
import time
from os.path import dirname, join as path_join

from pyhocon import ConfigFactory

# Make the project's dstools package importable relative to this script.
module_path = os.path.realpath(__file__)
root_dir = dirname(dirname(module_path))
sys.path.append(path_join(root_dir, 'dstools'))

import spark.core as spark_utils

parser = argparse.ArgumentParser()
parser.add_argument('--conf', required=True)
args, overrides = parser.parse_known_args()

# Leftover CLI tokens are treated as HOCON overrides that take precedence
# over the values in the config file.
file_conf = ConfigFactory.parse_file(args.conf, resolve=False)
overrides = ','.join(overrides)
over_conf = ConfigFactory.parse_string(overrides)
conf = over_conf.with_fallback(file_conf)

sc, sqc = spark_utils.init_session(conf['spark'],
                                   app=os.path.basename(args.conf),
                                   return_context=True)

# Optionally ship the pipeline module referenced by the config to the executors.
pipeline_file = conf.get('pipeline-file', None)

if pipeline_file is not None:
    pipeline_full_path = os.path.join(
        os.path.dirname(os.path.realpath(args.conf)), pipeline_file)
    sc.addPyFile(pipeline_full_path)

print('{} loading data...'.format(time.strftime("%Y-%m-%d %H:%M:%S")))

sdf = spark_utils.define_data_frame(conf['source'], sqc)
# Drop rows without a uid and make sure uid is a string column.
sdf = sdf.filter('uid is not null')
sdf = sdf.withColumn('uid', sdf.uid.astype('string'))
sdf = spark_utils.pandify(sdf)
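
Both examples merge command-line overrides into the file config the same way: the tokens argparse does not recognise are joined into a single HOCON string and layered over the file with pyhocon's with_fallback. Below is a minimal, self-contained sketch of that merge; the source.table and top-size keys are illustrative stand-ins, not taken from the original configs.

from pyhocon import ConfigFactory

# Base config as it might appear in the .conf file (keys are hypothetical).
file_conf = ConfigFactory.parse_string('''
source { table = "db.scores_daily" }
top-size = 0.1
''')

# Unrecognised CLI tokens such as "top-size=0.05" arrive as a list from
# parse_known_args(); joining them with commas yields one HOCON document.
overrides = ['top-size=0.05']
over_conf = ConfigFactory.parse_string(','.join(overrides))

# Values present in the overrides win; everything else falls back to the file.
conf = over_conf.with_fallback(file_conf)

print(conf['source']['table'])      # db.scores_daily (from the file)
print(float(conf.get('top-size')))  # 0.05 (overridden on the command line)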
Example #2
import argparse
import os
import sys
import time
from os.path import dirname, join as path_join

from pyhocon import ConfigFactory

# Make the project's dstools package importable relative to this script.
module_path = os.path.realpath(__file__)
root_dir = dirname(dirname(module_path))
sys.path.append(path_join(root_dir, 'dstools'))

import spark.core as spark_utils

parser = argparse.ArgumentParser()
parser.add_argument('--conf', required=True)
args, overrides = parser.parse_known_args()

# Leftover CLI tokens are treated as HOCON overrides that take precedence
# over the values in the config file.
file_conf = ConfigFactory.parse_file(args.conf, resolve=False)
overrides = ','.join(overrides)
over_conf = ConfigFactory.parse_string(overrides)
conf = over_conf.with_fallback(file_conf)

sqc = spark_utils.init_session(conf['spark'], app=os.path.basename(args.conf))

sdf = spark_utils.define_data_frame(conf['source'], sqc)

sdf = sdf.filter('uid is not null')

row_count = sdf.count()

# Keep the top fraction of rows (10% by default) ranked by predicted probability.
top_percent = float(conf.get('top-size', '.1'))
top_size = int(row_count * top_percent)

sdf = sdf.orderBy(sdf.target_proba.desc())

df = spark_utils.limit(sdf, top_size).toPandas()

# Stamp every row with the scoring timestamp.
df['current_dt'] = time.strftime('%Y-%m-%dT%H:%M:%S%z')
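
The core of Example #2, keeping the top top-size fraction of rows ranked by target_proba, can be reproduced with plain PySpark on toy data. This is a sketch under assumptions: the session setup and sample values are invented, and the built-in limit()/toPandas() stand in for the project's spark_utils helpers.

from pyspark.sql import SparkSession

spark = SparkSession.builder.master('local[1]').appName('top-fraction-demo').getOrCreate()

# Toy scored rows standing in for the real source table.
sdf = spark.createDataFrame(
    [('u1', 0.91), ('u2', 0.40), (None, 0.99), ('u3', 0.77), ('u4', 0.15)],
    ['uid', 'target_proba'])

sdf = sdf.filter('uid is not null')

row_count = sdf.count()
top_percent = 0.5                      # stand-in for conf.get('top-size', '.1')
top_size = int(row_count * top_percent)

# Order by descending score and keep the top slice; limit() plays the role
# of the spark_utils.limit() helper used above.
df = sdf.orderBy(sdf.target_proba.desc()).limit(top_size).toPandas()
print(df)

spark.stop()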