Example #1
from pyspark import SparkConf
from pyspark.sql import SparkSession, HiveContext


def create_context():
    '''
    Creates a SparkSession with Hive support and enables dynamic
    partitioning via the underlying HiveContext.

    Returns:
        SparkSession
    '''
    conf = SparkConf()
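    # Shuffle and broadcast tuning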
    conf.set('spark.sql.shuffle.partitions', 100)
    conf.set('spark.sql.broadcastTimeout', 1200)
    conf.set('spark.shuffle.service.enabled', 'true')
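    # Executor and driver sizing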
    conf.set('spark.executor.cores', 2)
    conf.set('spark.executor.instances', 4)
    conf.set('spark.executor.memory', '2G')
    conf.set('spark.driver.cores', 2)
    conf.set('spark.driver.memory', '2G')

    spark = SparkSession.builder \
        .appName('Recon') \
        .config(conf=conf) \
        .enableHiveSupport() \
        .getOrCreate()

    hivecontext = HiveContext(spark.sparkContext)
    hivecontext.setConf('hive.exec.dynamic.partition', 'true')
    hivecontext.setConf('hive.exec.dynamic.partition.mode', 'nonstrict')

    return spark
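A brief usage sketch (not part of the original snippet); the table name fex.eod is an assumption:

# Hypothetical usage of create_context(); fex.eod is an assumed table name.
spark = create_context()
eod = spark.sql("SELECT symbol, close FROM fex.eod LIMIT 10")
eod.show()
spark.stop()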
Example #2
def get_spark_test():
    conf = SparkConf()
    sc = SparkContext("local[4]", appName="youzan-algrithm", conf=conf)
    sql_context = HiveContext(sc)
    sql_context.sql(""" use fex_test """)
    sql_context.setConf("spark.sql.shuffle.partitions", "1")

    return sc, sql_context
Example #3
def get_spark(num=4, cores=4, mem="32g"):
    conf = SparkConf()
    conf.set("spark.executor.instances", "%d" % num)
    conf.set("spark.executor.cores", "%d" % cores)
    conf.set("spark.executor.memory", "%s" % mem)
    sc = SparkContext(appName="youzan-algrithm", conf=conf)
    sql_context = HiveContext(sc)
    sql_context.sql(""" use fex """)
    sql_context.setConf("spark.sql.shuffle.partitions", "16")

    return sc, sql_context
Example #4
import os
import sys

local_path = os.path.dirname(__file__)
sys.path.append(local_path + "/../lib")
sys.path.append(local_path + "/../")
sys.path.append(local_path)

from pyspark import SparkConf, SparkContext
from pyspark.sql import SQLContext, HiveContext

from post2 import adj, rlt


def run(sc, sql_context, is_hive):
    adj.main(sc, sql_context, is_hive=True)
    rlt.main(sc, sql_context, is_hive=True)


if __name__ == "__main__":
    conf = SparkConf()
    conf.set("spark.executor.instances", "16")
    conf.set("spark.executor.cores", "16")
    conf.set("spark.executor.memory", "8g")

    sc = SparkContext(appName="bintrade.post2.post_run",
                      master="yarn-client",
                      conf=conf)
    sqlContext = HiveContext(sc)
    sqlContext.setConf("spark.sql.shuffle.partitions", "32")
    sqlContext.sql("use fex")
    run(sc, sqlContext, is_hive=True)
Example #5
    hiveCtx.sql(sql).registerTempTable("temp_table")

    insert_sql = """
        insert overwrite table {table_name} partition(dt='{dt}')
        select  * from temp_table
        """.format(table_name=table_name, dt=dt_str)
    print("insert_sql:\n" + insert_sql)
    hiveCtx.sql(insert_sql)


if __name__ == "__main__":
    conf = SparkConf()
    sc = SparkContext(conf=conf, appName="sp-tfidf")
    sc.setLogLevel("WARN")
    hiveCtx = HiveContext(sc)
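    # Shuffle tuning for the TF-IDF job: consolidate shuffle files, give the
    # shuffle more memory, and raise the number of shuffle partitions.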
    hiveCtx.setConf('spark.shuffle.consolidateFiles', 'true')
    hiveCtx.setConf('spark.shuffle.memoryFraction', '0.4')
    hiveCtx.setConf('spark.sql.shuffle.partitions', '1000')
    if len(sys.argv) == 1:
        dt = datetime.datetime.now() + datetime.timedelta(-1)
    else:
        dt = datetime.datetime.strptime(sys.argv[1], "%Y%m%d").date()

    dt_str = dt.strftime("%Y-%m-%d")
    yest_dt = dt + datetime.timedelta(-30)
    yest_str = yest_dt.strftime("%Y-%m-%d")

    hiveCtx.sql("use app")
    create_table(hiveCtx)
    getQuery(hiveCtx)
Example #6
# author [email protected]
import os
import sys

local_path = os.path.dirname(__file__)
sys.path.append(local_path + "/../lib")
sys.path.append(local_path + "/../")
sys.path.append(local_path)

from pyspark import SparkConf, SparkContext
from pyspark.sql import SQLContext, HiveContext

from ml import diff_feature_reg, diff_train


def run(sc, sql_context, is_hive):
    diff_feature_reg.main(sc, sql_context, is_hive=True)
    diff_train.main(sc, sql_context, is_hive=True)

if __name__ == "__main__":
    conf = SparkConf()
    conf.set("spark.executor.instances", "4")
    conf.set("spark.executor.cores", "4")
    conf.set("spark.executor.memory", "32g")

    sc = SparkContext(appName="bintrade_candidate", master="yarn-client", conf=conf)
    sqlContext = HiveContext(sc)
    sqlContext.setConf("spark.sql.shuffle.partitions", "32")

    sqlContext.sql("use fex")

    run(sc, sqlContext, is_hive=True)
Example #7
from __future__ import print_function

try:
    import findspark

    findspark.init()
    import pyspark

    sc = pyspark.SparkContext()
    sc.setLogLevel('WARN')
    print("spark context created")

    from pyspark import SparkConf, SparkContext
    from pyspark.sql import SQLContext, HiveContext

    sqlc = SQLContext(sc)
    sqlh = HiveContext(sc)
    sqlh.setConf("spark.sql.parquet.compression.codec", "gzip")

except:
    print("spark context exists")
Example #8
def get_sql_context(sc):
    sqlContext = HiveContext(sc)
    sqlContext.setConf("hive.exec.dynamic.partition", "true")
    sqlContext.setConf("hive.exec.dynamic.partition.mode", "nonstrict")
    return sqlContext
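A minimal sketch of using the returned context once dynamic partitioning is enabled (the table names are assumptions, not from the original source):

# Hypothetical usage of get_sql_context(); sc and the table names are assumed.
sqlContext = get_sql_context(sc)
src = sqlContext.table("db.events_staging")
# With hive.exec.dynamic.partition enabled, partition values are taken from
# the partition column(s) of src rather than being declared per partition.
src.write.insertInto("db.events_partitioned", overwrite=True)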
Example #9
def prod_src():
    return {
        "psg_train": spark.table("prod_data.psg_train"),
        "psg_test": spark.table("prod_data.psg_test"),
        "psg_dev": spark.table("prod_data.psg_dev")
    }


def prod_dst():
    return {
        "psg_result": "prod_data.psg_result"
    }


if __name__ == '__main__':
    spark = SparkSession.builder.appName("calc_06_task").enableHiveSupport().getOrCreate()
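    # Dynamic partition overwrite: inserts replace only the partitions present
    # in the incoming data instead of truncating the whole table.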
    spark.conf.set("spark.sql.sources.partitionOverwriteMode", "dynamic")
    hivecontext = HiveContext(spark.sparkContext)
    hivecontext.setConf("hive.exec.dynamic.partition", "true")
    hivecontext.setConf("hive.exec.dynamic.partition.mode", "nonstrict")
    spark.sparkContext.setCheckpointDir("hdfs:///user/airflow/psg/calc_06_task")

    opts = {
        'from_dt': sys.argv[1],
        "to_dt": "9999-12-31"
    }

    update_last_partition(prod_dst(), opts["from_dt"], opts["to_dt"])
    calc_06(prod_src(), prod_dst(), opts["from_dt"], opts["to_dt"])

Example #10
    sqlstr = sqlstr[: len(sqlstr)-2]
    sqlstr += "\n) stored as orc"
    print sqlstr

    sql_context.sql(sqlstr)
    df.insertInto(tableName, overwrite)



if __name__ == '__main__':
    #log.debug("debug")
    #a = eval("(1,[2,3])")
    #print "xxxxxxx",a[1][0]
    #a = {1: 1.0, 3: 5.5}
    #str_a = str(a)
    #a = eval(str_a)
    #print a[1]

    #print json.loads("""{1:1}""")
    sc = SparkContext("local[1]", appName="bintrade.ml.diff_feature")
    sql_context = HiveContext(sc)
    sql_context.sql(""" use fex_test """)
    sql_context.setConf("spark.sql.shuffle.partitions", "1")


    ldict = [{"symbol":"AAA", "date":"2010-01-01", "close":1.0}, {"symbol":"AAA","date":"2010-01-01", "close":1.0}]

    df = sql_context.createDataFrame(ldict)
    dfToTableWithPar(sql_context, df,  "test_eod_AAA")

Example #11
    if len(sys.argv) < 6:
        print('Input Parameter missing', file=sys.stderr)
        exit(-1)
    sc = SparkContext(appName='SCD' + sys.argv[3])
    sqlContext = HiveContext(sc)
    tgt_schema = sys.argv[1]
    tgt_tbl_nm = sys.argv[2]
    src_schema = sys.argv[1]
    src_tbl_nm = sys.argv[3]
    load_dt = sys.argv[4]
    hist_delta = sys.argv[5]
    src_schema_tbl = src_schema + '.' + src_tbl_nm
    tgt_schema_tbl = tgt_schema + '.' + tgt_tbl_nm
    tgt_schema_stg_tbl = tgt_schema + '.' + tgt_tbl_nm + '_tgt'

    sqlContext.setConf("hive.exec.dynamic.partition", "true")
    sqlContext.setConf("hive.exec.dynamic.partition.mode", "nonstrict")
    sqlContext.setConf("hive.execution.engine", "spark")
    sqlContext.setConf("hive.vectorized.execution.enabled", "true")
    sqlContext.setConf("hive.vectorized.execution.reduce.enabled", "true")

    delta_columns = [
        "delta_acct_nbr", "delta_account_sk_id", "delta_zip_code",
        "delta_primary_state", "delta_eff_start_date", "delta_eff_end_date",
        "delta_load_tm", "delta_hash_key", "delta_eff_flag"
    ]
    hist_columns = [
        "acct_nbr", "account_sk_id", "zip_code", "primary_state",
        "eff_start_date", "eff_end_date", "load_tm", "hash_key", "eff_flag"
    ]
Example #12
    df.insertInto(tableName, overwrite)


if __name__ == '__main__':
    #log.debug("debug")
    #a = eval("(1,[2,3])")
    #print "xxxxxxx",a[1][0]
    #a = {1: 1.0, 3: 5.5}
    #str_a = str(a)
    #a = eval(str_a)
    #print a[1]

    #print json.loads("""{1:1}""")
    sc = SparkContext("local[1]", appName="bintrade.ml.diff_feature")
    sql_context = HiveContext(sc)
    sql_context.sql(""" use fex_test """)
    sql_context.setConf("spark.sql.shuffle.partitions", "1")

    ldict = [{
        "symbol": "AAA",
        "date": "2010-01-01",
        "close": 1.0
    }, {
        "symbol": "AAA",
        "date": "2010-01-01",
        "close": 1.0
    }]

    df = sql_context.createDataFrame(ldict)
    dfToTableWithPar(sql_context, df, "test_eod_AAA")