Ejemplo n.º 1
0
    current_dt = time.strftime("%Y-%m-%dT%H-%M")

    score_df = score_df.selectExpr("'{}' as model_name".format(model_name),
                                   "'{}' as current_dt".format(current_dt),
                                   '*')

    print('scores generated: {}'.format(score_df.count()))

    print('{} saving scores ...'.format(time.strftime("%Y-%m-%d %H:%M:%S")))

    spark_utils.write(conf['target'], score_df)

    print('execution time: {} sec'.format(time.time() - start))


if __name__ == "__main__":
    # Command-line entry point: assemble the configuration, start a Spark
    # session and hand both over to run_scorer.
    cli = argparse.ArgumentParser()
    cli.add_argument('--conf', required=True)
    # parse_known_args() returns the arguments it does not recognise as a
    # separate list instead of rejecting them; they are used as overrides.
    args, extra_args = cli.parse_known_args()

    # The file named by --conf is parsed first (resolve=False is passed
    # through unchanged), then the leftover arguments are comma-joined and
    # parsed as a configuration string.
    base_conf = ConfigFactory.parse_file(args.conf, resolve=False)
    override_text = ','.join(extra_args)
    cli_conf = ConfigFactory.parse_string(override_text)
    # Command-line config is the primary tree; the file config is its fallback.
    conf = cli_conf.with_fallback(base_conf)

    # The Spark application is named after the configuration file.
    spark_conf = conf['spark']
    app_name = os.path.basename(args.conf)
    sc, sqc = spark_utils.init_session(spark_conf,
                                       app=app_name,
                                       return_context=True)

    run_scorer(sc, sqc, conf)
Ejemplo n.º 2
0
from pyhocon import ConfigFactory

import sparktools.core as spark_utils

# Script body: build the configuration from --conf plus command-line
# overrides, start a Spark session, then pass the data frame defined by
# conf['source'] to spark_utils.write with conf['target'], printing progress
# and timing information along the way.
start = time.time()

# Timestamp layout shared by every progress message below.
ts_format = "%Y-%m-%d %H:%M:%S"

launch_stamp = time.strftime(ts_format)
script_name = os.path.basename(__file__)
print('{tm} ------------------- {nm} started'.format(
    tm=launch_stamp, nm=script_name))

cli = argparse.ArgumentParser()
cli.add_argument('--conf', required=True)
# Arguments argparse does not recognise are returned separately and are
# used as configuration overrides.
args, extra_args = cli.parse_known_args()

# File config is parsed first; the leftover arguments are comma-joined and
# parsed as a configuration string that takes the file config as fallback.
base_conf = ConfigFactory.parse_file(args.conf, resolve=False)
override_text = ','.join(extra_args)
cli_conf = ConfigFactory.parse_string(override_text)
conf = cli_conf.with_fallback(base_conf)

# The Spark application is named after the configuration file.
app_name = os.path.basename(args.conf)
sqc = spark_utils.init_session(conf['spark'], app=app_name)

print('{tm} moving data...'.format(tm=time.strftime(ts_format)))

source_df = spark_utils.define_data_frame(conf['source'], sqc)
spark_utils.write(conf['target'], source_df)

# NOTE(review): count() runs after the write; if source_df is a lazily
# evaluated Spark DataFrame this presumably evaluates the source a second
# time - confirm whether that cost is acceptable.
row_count = source_df.count()
print('data set size: {sz}'.format(sz=row_count))
print(
    '{tm} download is finished'.format(tm=time.strftime(ts_format)))

elapsed = time.time() - start
print('execution time: {} sec'.format(elapsed))