Example #1
def Base_Spark(name=None, config=None, context=False):
    """
    Get spark object.
    :param name: string. The name of the spark task.
    :param config: dict. The config of the spark task.
    :param context: bool. Set to True to get the HiveContext (sqlContext) instead of the SparkSession.
    :return: SparkSession object, or HiveContext when context is True.
    """
    conf = SparkConf()

    if config:
        for k, v in config.items():
            conf.set(k, v)

    sc = SparkContext(conf=conf, appName=name if name else None)
    sc.setLogLevel("WARN")
    sqlContext = HiveContext(sc)

    if config:
        for k, v in config.items():
            if 'hive.' in k:
                sqlContext.setConf(k, v)
    if context:
        return sqlContext
    else:
        spark = sqlContext.sparkSession
        return spark
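# Usage sketch (illustrative only; the task name and config keys below are assumptions,
# not part of the function above):
# spark = Base_Spark(name="daily_etl",
#                    config={"spark.sql.shuffle.partitions": "64",
#                            "hive.exec.dynamic.partition.mode": "nonstrict"})
# sql_context = Base_Spark(name="daily_etl", context=True)  # returns the HiveContext instead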
Example #2
def get_context_test():
    conf = SparkConf()
    sc = SparkContext('local[1]', conf=conf)
    sql_context = HiveContext(sc)
    sql_context.sql("""use fex_test""")
    sql_context.setConf("spark.sql.shuffle.partitions", "1")
    return sc, sql_context
Example #3
def get_context_test():
    conf = SparkConf()
    sc = SparkContext('local[1]', conf=conf)
    sql_context = HiveContext(sc)
    sql_context.sql("""use fex_test""")
    sql_context.setConf("spark.sql.shuffle.partitions", "1")
    return sc, sql_context
def write_to_hive(time, rdd):
    def process_row(x):
        row_dict = dict()
        row_dict["timestamp"] = 0 if "timestamp" not in x else x["timestamp"]
        row_dict["source_type"] = "" if "source.type" not in x else x["source.type"]
        row_dict["user_name"] = "" if "src_user_name" not in x else x["src_user_name"]
        row_dict["entity_name"] = "" if "ip_src_addr" not in x else x["ip_src_addr"]
        row_dict["guid"] = "" if "guid" not in x else x["guid"]
        row_dict["alert_score"] = 0.0 if "alert_score" not in x else x["alert_score"]
        row_dict["alerts"] = "" if "alerts" not in x else x["alerts"]
        row_dict["y"] = 0 if "y" not in x else x["y"]
        row_dict["m"] = None if "m" not in x else x["m"]
        row_dict["d"] = None if "d" not in x else x["d"]
        for numerical_colname in EVENT_MODEL_NUMERICAL_COLUMNS:
            row_dict[numerical_colname] = 0.0 if numerical_colname not in x else float(x[numerical_colname])
        for categorical_colname in EVENT_MODEL_CATEGORICAL_COLUMNS:
            row_dict[categorical_colname] = "" if categorical_colname not in x else str(x[categorical_colname])

        row = Row(**row_dict)

        return row

    try:
        spark = SparkSession \
            .builder \
            .appName("event-anomaly-online-score") \
            .enableHiveSupport() \
            .getOrCreate()
        hive_context = HiveContext(spark.sparkContext)
        hive_context.setConf("hive.exec.dynamic.partition", "true")
        hive_context.setConf("hive.exec.dynamic.partition.mode", "nonstrict")
        row_rdd = rdd.map(process_row)
        sdf = hive_context.createDataFrame(row_rdd)
        sdf = sdf.drop_duplicates(subset=["guid"])
        sdf.cache()
        source_type_list = [TENANT_NAME + "_" + data_source for data_source in DATA_SOURCE_LIST]
        model_dict = dict()
        for data_source in DATA_SOURCE_LIST:
            model_dict[data_source] = load_event_anomaly_model(spark=spark, data_source=data_source)

        for source_type in source_type_list:
            sdf_source = sdf.filter(sdf.source_type == source_type)
            if not sdf_source.rdd.isEmpty():
                sdf_source.cache()
                database = source_type.split("_")[0]
                data_source = source_type.split("_")[1]
                table = data_source + "_event_alert_score"
                sdf_source.show(3)
                eas_sdf = get_event_anomaly_score(data_source=data_source, model_dict=model_dict,
                                                  input_df=sdf_source)
                result_sdf = sdf_source.join(eas_sdf.select(["guid", "EAS"]), on="guid", how="left")
                result_sdf = result_sdf.na.fill(0.0, subset=["EAS"])
                result_sdf.show(3)
                result_sdf.select("guid", "timestamp", "user_name", "entity_name", "source_type", "alerts",
                                  "alert_score",
                                  "EAS", "y", "m", "d").write.insertInto(database + "." + table)

    except Exception as e:
        print(str(e))
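# Usage sketch (illustrative): the (time, rdd) signature above is the callback shape
# expected by DStream.foreachRDD, e.g.
# kafka_stream.foreachRDD(write_to_hive)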
Example #5
def create_context():
    # Creates the Spark context
    sc = SparkContext(appName="hdfs2hive-test")

    # Creates the Hive context
    hiveContext = HiveContext(sc)
    hiveContext.setConf('hive.exec.dynamic.partition.mode', 'nonstrict')
    return sc, hiveContext
    def process(time, rdd):
        print("========= %s =========" % str(time))

        try:
            sqlContext = HiveContext(sc)
            # FIX: memory error Spark 2.0 bug ( < 2.0 )
            sqlContext.setConf("spark.sql.tungsten.enabled","false")

            # v2.01 spark = SparkSession.builder \
            #.master("local") \
            #.appName("Word Count") \
            #.config("spark.some.config.option", "some-value") \
            #.getOrCreate()
            # Get the singleton instance of SparkSession
            #nzs v1.0 spark = getSparkSessionInstance(rdd.context.getConf())

            if rdd.count() < 1:
                return

            # Convert RDD[String] to RDD[Row] to DataFrame
            sqlRdd = rdd.map(lambda x: json.loads(x)).map(lambda r: Row(metrics=r["metrics"], name=r["name"], value=r["value"]))
            wordsDataFrame = sqlContext.createDataFrame(sqlRdd)
            wordsDataFrame.show()
            # Creates a temporary view using the DataFrame.
            wordsDataFrame.registerTempTable("starwarstemp")
            # Run a query against the temp table to get the alarm dataset
            wordCountsDataFrame = sqlContext.sql("select * from starwarstemp")
            wordCountsDataFrame.printSchema()


            with open(SparkFiles.get('webinar_streaming.sql')) as test_file:
                alertsql=test_file.read()
                #logging.info(alertsql)

            alertDataFrame = sqlContext.sql(alertsql)			
            alertDataFrame.show()
            alertDataFrame.printSchema()			

            # save all values to HBASE 
            # IF NEED FILTER LATER .filter(lambda x: str(x["metrics"])=='action-credit-limit') \
            # create HBASE mapper 
            rowRdd = rdd.map( lambda x: json.loads(x))\
                .map(lambda r: ( str(r["metrics"]) ,[ str(r["name"])+"-"+datetime.datetime.now().strftime("%Y%m%d%H%M%S"), "action" if str(r["metrics"])=="action-credit-limit" else  "healt", str(r["metrics"]), str(r["value"])] ))
            
            table = 'starwarsinbox'
            host = 'node-master2-KcVkz'
            keyConv = "org.apache.spark.examples.pythonconverters.StringToImmutableBytesWritableConverter"
            valueConv = "org.apache.spark.examples.pythonconverters.StringListToPutConverter"
            conf = {"hbase.zookeeper.quorum": host,
            "hbase.mapred.outputtable": table,
            "mapreduce.outputformat.class": "org.apache.hadoop.hbase.mapreduce.TableOutputFormat",
            "mapreduce.job.output.key.class": "org.apache.hadoop.hbase.io.ImmutableBytesWritable",
            "mapreduce.job.output.value.class": "org.apache.hadoop.io.Writable"}
            rowRdd.saveAsNewAPIHadoopDataset(conf=conf,keyConverter=keyConv,valueConverter=valueConv)
        except Exception as merror:
            print (merror)
            raise
Example #7
def get_context():
    conf = SparkConf()
    conf.set("spark.executor.instances", "4")
    conf.set("spark.executor.cores", "4")
    conf.set("spark.executor.memory", "8g")
    sc = SparkContext(appName=__file__, conf=conf)
    sql_context = HiveContext(sc)
    sql_context.sql("""use fex""")
    sql_context.setConf("spark.sql.shuffle.partitions", "32")
    return sc, sql_context
def write_spark_df_to_hdfs(spark, output_schema, database, table_name, sdf,
                           timestamp):
    hive_context = HiveContext(spark.sparkContext)
    hive_context.setConf("hive.exec.dynamic.partition", "true")
    hive_context.setConf("hive.exec.dynamic.partition.mode", "nonstrict")

    sdf = sdf.withColumn("execution_timestamp", lit(timestamp))
    sdf = sdf.na.fill(0.0,
                      subset=["pas_kmeans", "pas_isolation", "pas_svm", "pas"])
    sdf.select(output_schema).write.insertInto(database + "." + table_name)
Example #9
def get_context():
    conf = SparkConf()
    conf.set("spark.executor.instances", "4")
    conf.set("spark.executor.cores", "4")
    conf.set("spark.executor.memory", "8g")
    sc = SparkContext(appName=__file__, conf=conf)
    sql_context = HiveContext(sc)
    sql_context.sql("""use fex""")
    sql_context.setConf("spark.sql.shuffle.partitions", "32")
    return sc, sql_context
Example #10
def init_session(config,
                 app=None,
                 return_context=False,
                 overrides=None,
                 use_session=False):
    import os
    from pyhocon import ConfigFactory, ConfigParser

    if isinstance(config, str):
        if os.path.exists(config):
            base_conf = ConfigFactory.parse_file(config, resolve=False)
        else:
            base_conf = ConfigFactory.parse_string(config, resolve=False)
    elif isinstance(config, dict):
        base_conf = ConfigFactory.from_dict(config)
    else:
        base_conf = config

    if overrides is not None:
        over_conf = ConfigFactory.parse_string(overrides)
        conf = over_conf.with_fallback(base_conf)
    else:
        conf = base_conf
        ConfigParser.resolve_substitutions(conf)

    res = init_spark(conf, app, use_session)

    if use_session:
        return res
    else:
        mode_yarn = conf['spark-prop.spark.master'].startswith('yarn')

        if mode_yarn:
            from pyspark.sql import HiveContext
            sqc = HiveContext(res)

            if 'hive-prop' in conf:
                for k, v in prop_list(conf['hive-prop']).items():
                    sqc.setConf(k, str(v))
        else:
            from pyspark.sql import SQLContext
            sqc = SQLContext(res)

        if return_context:
            return res, sqc
        else:
            return sqc
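# Usage sketch (illustrative; the HOCON keys are assumptions based on the lookups
# performed above, and init_spark/prop_list come from the same module):
# conf = {
#     "spark-prop": {"spark.master": "yarn", "spark.executor.memory": "4g"},
#     "hive-prop": {"hive.exec.dynamic.partition.mode": "nonstrict"},
# }
# sc, sqc = init_session(conf, app="my-app", return_context=True)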
Example #11
def write_to_hive(spark, sdf, data_source_list):
    try:
        hive_context = HiveContext(spark.sparkContext)
        hive_context.setConf("hive.exec.dynamic.partition", "true")
        hive_context.setConf("hive.exec.dynamic.partition.mode", "nonstrict")
        database = TENANT_NAME
        table = "anomaly_score"
        # sdf.show()
        columns = ["name", "type", "time_window", "timestamp"] + [
            "as_" + data_source for data_source in data_source_list
        ] + ["score", "y", "m", "d"]

        sdf.select(columns).write.insertInto(database + "." + table)

    except Exception as e:
        print(str(e))
        pass
def create_hive_session(app_name):
    # Import SparkConf & SparkContext lib
    from pyspark import SparkConf, SparkContext

    # Import HiveContext, required to create the Hive context
    logger.debug("Importing pyspark.sql.HiveContext")
    from pyspark.sql import HiveContext
    logger.debug("pyspark.sql.HiveContext imported")

    # Create a spark context
    logger.debug("Creating hive context....")
    conf = SparkConf().setAppName(app_name)
    sc = SparkContext(conf=conf)

    # Return a hive context to the function caller
    hc = HiveContext(sc)
    hc.setConf("hive.metastore.uris", config["HIVE"]["hive.metastore.uris"])
    return hc
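# Usage sketch (illustrative; assumes the module-level `config` and `logger`
# referenced above are already initialised):
# hc = create_hive_session("my-etl-app")
# hc.sql("show databases").show()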
Example #13
def main(args=None):
    def create():
        database.create_database(hc=hc, json_config=json_config)
        trades.createTableContracts(hc=hc, json_config=json_config)
        products.createTableProducts(hc=hc, json_config=json_config)

    def delete():
        trades.deleteTableContracts(hc=hc, json_config=json_config)
        products.deleteTableProducts(hc=hc, json_config=json_config)
        database.delete_database(hc=hc, json_config=json_config)

    args = args_parser.parse_arguments()
    json_config = util_functions.load_json_config(args.json_config)

    sc = SparkContext.getOrCreate()
    hc = HiveContext(sc)
    hc.setConf("hive.exec.dynamic.partition", "true")
    hc.setConf("hive.exec.dynamic.partition.mode", "nonstrict")
    hc.setConf("spark.sql.hive.convertMetastoreOrc", "false")

    if args.action == "create":
        # delete()
        create()

    elif args.action == "delete":
        delete()
    def create_dataframe_from_hive(spark_session, dbConnectionParams):
        df = None
        try:
            spark = SparkSession.builder.appName("Testing").config(
                conf=SparkConf()).enableHiveSupport().getOrCreate()
            sqlContext = HiveContext(spark.sparkContext)
            sqlContext.setConf(
                "hive.metastore.uris",
                "thrift://{}:{}".format(dbConnectionParams.get("host"),
                                        dbConnectionParams.get("port")))

            tdf = sqlContext.sql("show databases")
            tdf.show()

            schema = DataLoader.get_db_name(dbConnectionParams)
            table_name = dbConnectionParams.get("tablename")
            df = sqlContext.table(".".join([schema, table_name]))

        except Exception as e:
            print("couldn't connect to hive")
            raise e
        return df
Example #15
def write_to_hive(spark, rdd, key, time_window, timestamp, data_source):
    def process_line(x):
        row_dict = dict()
        row_dict["name"] = x[key]
        row_dict["type"] = key
        row_dict["time_window"] = time_window
        row_dict["timestamp"] = timestamp
        row_dict["pas_kmeans"] = x["pas_kmeans"]
        row_dict["pas_isolation"] = x["pas_isolation"]
        row_dict["pas_svm"] = x["pas_svm"]
        row_dict["pas"] = x["pas"]
        row_dict["d"] = d
        row_dict["m"] = m
        row_dict["y"] = y
        row = Row(**row_dict)
        return row

    try:
        hive_context = HiveContext(spark.sparkContext)
        hive_context.setConf("hive.exec.dynamic.partition", "true")
        hive_context.setConf("hive.exec.dynamic.partition.mode", "nonstrict")
        database = TENANT_NAME
        table = data_source + "_profile_score"
        date = datetime.datetime.fromtimestamp(timestamp / 1000.0)
        d = date.day
        m = date.month
        y = date.year
        row_rdd = rdd.map(lambda x: process_line(x))
        sdf = spark.createDataFrame(row_rdd)
        sdf = sdf.na.fill(
            0.0, subset=["pas_kmeans", "pas_isolation", "pas_svm", "pas"])
        sdf.select("name", "type", "time_window", "timestamp", "pas_kmeans",
                   "pas_isolation", "pas_svm", "pas", "y", "m",
                   "d").write.insertInto(database + "." + table)

    except Exception as e:
        print(str(e))
        pass
Example #16
def main(sc, load_id):
    sqlContext = HiveContext(sc)

    emp_table = sqlContext.sql("select emp_id,emp_name,emp_dept,gender,division from emp_table")
    emp_table.createOrReplaceTempView("emp_df")
    movies_watched = sqlContext.sql("select emp_id as emp_idm,movie_name from movies_table")
    movies_watched.createOrReplaceTempView("movies_df")

    movies_watched.persist(StorageLevel.MEMORY_AND_DISK)  ## Persisting movies dataframe

    joined_df = emp_table.alias('v1').join(movies_watched.alias('v2'), col('v1.emp_id') == col('v2.emp_idm'), 'inner') \
        .select(col('v1.emp_id'), col('v1.emp_dept'), col('v2.movie_name'))

    joined_df_final = joined_df.repartition(len(joined_df.select(col('emp_dept')).distinct().collect()))  ## repartitioning on the distinct emp_dept values
    joined_df_final.createOrReplaceTempView('temp_table')

    sqlContext.setConf("hive.merge.mapredfiles", "false")
    sqlContext.setConf("hive.merge.smallfiles.avgsize", "16000000")
    sqlContext.setConf("hive.execution.engine", "mr")
    final_sql = 'INSERT OVERWRITE TABLE target_table select * from temp_table'
    sqlContext.sql(final_sql)
import os
import traceback
import sys
import time
import datetime
from datetime import date, timedelta

from pyspark.sql.functions import *
from pyspark import SparkConf, SparkContext
from pyspark.sql import HiveContext

conf = SparkConf().setAppName('retail_usecase_processing')
sc = SparkContext(conf=conf)
sqlContext = HiveContext(sc)

sqlContext.setConf("spark.sql.shuffle.partitions", "10")
sqlContext.setConf("hive.exec.dynamic.partition", "true")
sqlContext.setConf("hive.exec.dynamic.partition.mode", "nonstrict")

DB_NAME = "retail"
#DB_NAME = sys.argv[0]
date_difference_in_days = 1
#date_difference_in_days = sys.argv[1]
PARTITION_DATE = (datetime.datetime.now() -
                  timedelta(days=date_difference_in_days)).strftime('%Y-%m-%d')
#PARTITION_MONTH = (datetime.datetime.now() - timedelta(months=1)).strftime('%Y-%m')
PARTITION_MONTH = (date.today().replace(day=1) -
                   timedelta(days=1)).strftime('%Y-%m')

# Loading categories table
categories = sqlContext.sql("""select * from {0}.categories""".format(DB_NAME))
#!/usr/bin/env python
#coding=utf-8
from pyspark import SparkConf, SparkContext
from pyspark.sql import HiveContext
from pyspark.sql import SQLContext
import pandas as pd
import numpy as np

# initialize spark
conf = SparkConf().setMaster('local').setAppName('testApp')
sc = SparkContext(conf=conf)

hiveCtx = HiveContext(sc)
hiveCtx.setConf("hive.exec.orc.split.strategy", "ETL")

orcfile = "hdfs:///user/hive/warehouse/answer/2017-04/answer__dc8f5871_82c0_42e8_ab39_21b57b5a663a"
df = hiveCtx.read.orc(orcfile)
df.show()
Example #19
def merge_data (sc,table_name):
    print ("Entered Merged data Function Testing")

    sqlContext = HiveContext(sc)
    config = read_config(['/apps/incremental/hdp/environ.properties'])


    input_schema_name, input_table_name = table_name.split('.')

    if(config == None):
        print "Configuration Entry Missing"
        sys.exit(1)

    # get the current branch (from local.properties)
    env            = config.get('branch','env')

    # proceed to point everything at the 'branched' resources
    dbUrl                      = config.get(env+'.mysql','dbUrl')
    dbUser                     = config.get(env+'.mysql','dbUser')
    dbPwd                      = base64.b64decode(config.get(env+'.mysql','dbPwd'))
    dbMetastore_dbName         = config.get(env+'.mysql','dbMetastore_dbName')
    dbApp_dbName               = config.get(env+'.mysql','dbApp_dbName')
    bucket_name                = config.get(env+'.s3','bucket_name')

    print (dbUrl,",",dbUser,",",dbPwd,",",dbMetastore_dbName,",",dbApp_dbName)
    # Connection to the Hive Metastore to get column and partition list
    connection     =  mysql.connector.connect(user=str(dbUser),password=str(dbPwd),host=str(dbUrl),database=str(dbApp_dbName))

    # Get control table access
    try:
        cursor     = connection.cursor()
        merge_sql = "SELECT * FROM application.control_table WHERE target_schemaname = '" + input_schema_name + "' and target_tablename = '" + input_table_name + "'"
        print merge_sql
        cursor.execute(merge_sql)

        control     = cursor.fetchall()
    except Exception as e:
        print 'Issue connecting to metadata database:', e
    finally:
        connection.close()

    control_list = list(chain.from_iterable(control))

    if  not control_list:
        print "Control Entry missing in table"
        sys.exit(1)

    source_schema                     = str(control_list[1])
    source_tablename                  = str(control_list[2])
    target_schema                     = str(control_list[3])
    target_tablename                  = str(control_list[4])
    partitioned                       = control_list[5]
    load_type                         = str(control_list[6])
    s3_backed                         = control_list[7]
    first_partitioned_column          = str(control_list[8])
    second_partitioned_column         = str(control_list[9])
    partitioned_column_transformation = str(control_list[10])
    custom_sql                        = str(control_list[11])
    join_columns                      = str(control_list[12])
    archived_enabled                  = control_list[13]
    distribution_columns              = str(control_list[18])
    dist_col_transformation           = str(control_list[19])

    print distribution_columns, dist_col_transformation

    # Connection to the Hive Metastore to get column and partition list
    connection     =  mysql.connector.connect(user=dbUser, password=dbPwd,host=dbUrl,database=dbMetastore_dbName)

    # Establish connection to the hive metastore to get the list of columns
    try:
        cursor     = connection.cursor()
        #cursor.execute("""SELECT COLUMN_NAME, TBL_NAME FROM COLUMNS_V2 c JOIN TBLS a ON c.CD_ID=a.TBL_ID where a.TBL_ID = 52""")
        #cursor.execute("""SELECT COLUMN_NAME FROM COLUMNS_V2 c JOIN TBLS a ON c.CD_ID=a.TBL_ID where a.TBL_ID = 52""")
        sql_query = "SELECT                                                                   \
                            c.COLUMN_NAME                                                     \
                    FROM                                                                      \
                        TBLS t                                                                \
                        JOIN DBS d                                                            \
                            ON t.DB_ID = d.DB_ID                                              \
                        JOIN SDS s                                                            \
                            ON t.SD_ID = s.SD_ID                                              \
                        JOIN COLUMNS_V2 c                                                     \
                            ON s.CD_ID = c.CD_ID                                              \
                    WHERE                                                                     \
                        TBL_NAME = " + "'" + target_tablename + "' " + "                      \
                        AND d.NAME=" + " '" + target_schema + "' " +  "                       \
                        ORDER by c.INTEGER_IDX"

        cursor.execute(sql_query)
        target_result     = cursor.fetchall()


        sql_query = "SELECT                                                                   \
                            c.COLUMN_NAME                                                     \
                    FROM                                                                      \
                        TBLS t                                                                \
                        JOIN DBS d                                                            \
                            ON t.DB_ID = d.DB_ID                                              \
                        JOIN SDS s                                                            \
                            ON t.SD_ID = s.SD_ID                                              \
                        JOIN COLUMNS_V2 c                                                     \
                            ON s.CD_ID = c.CD_ID                                              \
                    WHERE                                                                     \
                        TBL_NAME = " + "'" + source_tablename + "' " + "                      \
                        AND d.NAME=" + " '" + source_schema + "' " +  "                       \
                        ORDER by c.INTEGER_IDX"

        cursor.execute(sql_query)
        source_result     = cursor.fetchall()


    except Exception as e:
        print 'Issue running SQL in hive metadata database:', e
        raise
    finally:
        connection.close()

    # Get the column on which the table is partitioned
    source_select_list           = ', '.join(map(''.join,source_result))
    target_select_list           = ', '.join(map(''.join,target_result))

    if not source_select_list:
        print "Hive Table Not Found in metadata database"
        sys.exit(1)
    # Create the SELECT query string for fetching data from the external table
    if len(dist_col_transformation) > 0 :
        target_select_list           = target_select_list
        source_select_list           = source_select_list + ' , ' + dist_col_transformation

    if (partitioned):
        incremental_sql_query = 'select ' + source_select_list + ', ' + partitioned_column_transformation + ' from ' + source_schema + '.' + source_tablename
        if second_partitioned_column <> 'None':
            target_sql_query      = 'select ' + target_select_list + ', ' + first_partitioned_column + ', ' + second_partitioned_column + ' from ' + target_schema + '.' + target_tablename
        else:
            target_sql_query      = 'select ' + target_select_list + ', ' + first_partitioned_column + ' from ' + target_schema + '.' + target_tablename
    else:
        incremental_sql_query = 'select ' + source_select_list + ' from ' + source_schema + '.' + source_tablename
        target_sql_query      = 'select ' + target_select_list + ' from ' + target_schema + '.' + target_tablename

    connection     =  mysql.connector.connect(user=dbUser, password=dbPwd,host=dbUrl,database=dbMetastore_dbName)
    try:
        cursor     = connection.cursor()
        #cursor.execute("""SELECT COLUMN_NAME, TBL_NAME FROM COLUMNS_V2 c JOIN TBLS a ON c.CD_ID=a.TBL_ID where a.TBL_ID = 52""")
        #cursor.execute("""SELECT COLUMN_NAME FROM COLUMNS_V2 c JOIN TBLS a ON c.CD_ID=a.TBL_ID where a.TBL_ID = 52""")
        bloom_sql_query = "SELECT e.PARAM_VALUE                                                 \
                           FROM                                                                 \
                                TBLS t                                                          \
                           JOIN DBS d                                                           \
                              ON t.DB_ID = d.DB_ID                                              \
                           LEFT OUTER JOIN TABLE_PARAMS e                                       \
                                ON t.TBL_ID = e.TBL_ID                                          \
                                AND PARAM_KEY = 'orc.bloom.filter.columns'                      \
                           WHERE                                                                \
                                TBL_NAME = " + "'" + target_tablename + "' " + "                \
                                AND d.NAME=" + " '" + target_schema + "' "                      \

        cursor.execute(bloom_sql_query)
        bloom_filter      = cursor.fetchall()
    except Exception as e:
        print 'Issue running SQL in hive metadata database:', e
        raise
    finally:
        connection.close()

    bloom_filters_columns = ''
    if len(bloom_filter) > 1:
        bloom_filters_columns = ','.join(map(''.join,bloom_filter))
        bloom_filter_list = bloom_filters_columns.split(",")
    # Execute the query to get the data into Spark Memory

    # Figure out if it is incremental or full load process
    # If Full Load then truncate the target table and insert the entire incoming data
    # If Incremental Load then determine if the table is partitioned as the logic needs to be handled differently for partitioned and non-partitioned tables
    #      For Non Partitioned Tables merge the incoming data with the table date and save it to the database
    #      For Partitioned Tables identify the partitions for which there is incremental data and intelligently merge the data and save it to the database table
    table_name = target_schema + '.' + target_tablename
    sqlContext.setConf("hive.exec.dynamic.partition", "true")
    sqlContext.setConf("hive.exec.dynamic.partition.mode", "nonstrict")
    sqlContext.setConf("spark.sql.orc.filterPushdown", "true")
    sqlContext.setConf("mapred.input.dir.recursive", "true")
    sqlContext.setConf("hive.mapred.supports.subdirectories", "true")

    sc._jsc.hadoopConfiguration().set('fs.s3a.attempts.maximum','30')

    if s3_backed:
         path = 's3a://' + bucket_name + '/' + target_schema + '/' + target_tablename + '/'
    else:
         path = '/apps/hive/warehouse/'  + target_schema + '.db/' + target_tablename + '/'

    if second_partitioned_column <> 'None':
        partitioned_columns = first_partitioned_column + ',' + second_partitioned_column
    else:
        partitioned_columns = first_partitioned_column

    if len (distribution_columns) > 0 :
       bucket_columns = partitioned_columns + ',' + distribution_columns
       bucket_column_list = bucket_columns.split(",")
    else:
       bucket_columns = partitioned_columns
       bucket_column_list = bucket_columns.split(",")

    if len (partitioned_columns) > 0:
       partition_column_list = partitioned_columns.split(",")

    from pyspark.sql.functions import col
    try:
####################################################################################################################################################################
# Below logic is to sort the data based on the bloom filter columns across multiple tasks. This is the most optimal way for storing data for efficient             #
# reads but it takes a lot of time to load the data as the data is stored one partition at a time. We can speed up the process by persisting data but then if the  #
# partitions are not equally sized there are chances of shuffle reads crossing 2GB limit causing MAX_INT error.                                                    #
#                                                                                                                                                                  #
# Solution for Spark 2.1 : To sort the data by task so that the performance would be better than not sorting but less efficient than the below process             #
# Solution for Spark 2.2 : Use the sortBy API being introduced                                                                                                     #
#                                                                                                                                                                  #
# The reason for commenting out the code instead of removing is to prove that the logic can be implemented technically in prior versions of Spark but it is very   #
# inefficient                                                                                                                                                      #
####################################################################################################################################################################
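# A sketch of the DataFrameWriter sortBy alternative referenced above (an assumption, not part of the original job):
# bucketBy/sortBy order the data per bucket at write time, but they require writing through the metastore with
# saveAsTable rather than a direct path save, e.g.
#     merge_df.write.format("orc").option("compression", "zlib") \
#             .bucketBy(8, *bucket_column_list).sortBy(*bloom_filter_list) \
#             .mode("overwrite").saveAsTable(table_name)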
                    # first_partitioned_list = final_df.select(first_partitioned_column) \
                    #                         .rdd.flatMap(lambda x: x).distinct().collect()
                    # if second_partitioned_column <> 'None':
                    #     second_partitioned_list = final_df.select(second_partitioned_column)\
                    #                         .rdd.flatMap(lambda x: x).distinct().collect()
                    # #final_df.persist()
                    # if second_partitioned_column <> 'None':
                    #     for first_partition in first_partitioned_list:
                    #         for second_partition in  second_partitioned_list:
                    #             final_path = path + first_partitioned_column + '=' + format(first_partition) + '/' +  \
                    #                     second_partitioned_column + '=' + format(second_partition)
                    #             write_df = final_df.where(col(first_partitioned_column).isin(format(first_partition)) &
                    #                                       col(second_partitioned_column).isin(format(second_partition)))
                    #             save_df = write_df.drop(first_partitioned_column).drop(second_partitioned_column)
                    #             save_df.write.option("compression","zlib").mode("overwrite").format("orc").save(path)
                    # else:
                    #     for first_partition in first_partitioned_list:
                    #         final_path = path + first_partitioned_column + '=' + format(first_partition)
                    #         print path
                    #         write_df = final_df.where(col(first_partitioned_column).isin(format(first_partition)))
                    #         save_df = write_df.drop(first_partitioned_column)
                    #         save_df.write.option("compression","zlib").mode("overwrite").format("orc").save(final_path)
        if load_type == 'FULL':
            merge_df = sqlContext.sql(incremental_sql_query)
            if partitioned:
                if bloom_filters_columns:
                    final_df = merge_df.repartition(len(merge_df.select(bucket_column_list).distinct().collect()),bucket_column_list) \
                               .sortWithinPartitions(bloom_filter_list)
                else:
                    final_df = merge_df.repartition(len(merge_df.select(bucket_column_list).distinct().collect()),bucket_column_list)
                final_df.write.option("compression","zlib").mode("overwrite").format("orc").partitionBy(partition_column_list).save(path)
            else:
                if merge_df.rdd.getNumPartitions() > 300:
                    merge_coalesce_df = merge_df.coalesce(300)
                else:
                    merge_coalesce_df = merge_df
                if bloom_filters_columns:
                    save_df = merge_coalesce_df.sortWithinPartitions(bloom_filters_columns)
                    save_df.write.option("compression","zlib").mode("overwrite").format("orc").save(path)
                else:
                    merge_coalesce_df.write.option("compression","zlib").mode("overwrite").format("orc").save(path)
        # Incremental Logic for Append Only table especially for S3
        elif load_type == 'APPEND_ONLY':
            merge_df = sqlContext.sql(incremental_sql_query)
            if s3_backed:
                temp_table = target_tablename + '_tmp'
                temp_path  = '/apps/hive/warehouse/'  + target_schema + '.db/' + temp_table + '/'
            else:
                temp_path  = path
            print temp_path
            if  partitioned:
                if bloom_filters_columns:
                    final_df = merge_df.repartition(len(merge_df.select(bucket_column_list).distinct().collect()),bucket_column_list) \
                               .sortWithinPartitions(bloom_filter_list)
                else:
                    final_df = merge_df.repartition(len(merge_df.select(bucket_column_list).distinct().collect()),bucket_column_list)
                final_df.write.option("compression","zlib").mode("append").format("orc").partitionBy(partition_column_list).save(temp_path)
            else:
                if merge_df.rdd.getNumPartitions() > 300:
                    merge_coalesce_df = merge_df.coalesce(300)
                else:
                    merge_coalesce_df = merge_df
                if bloom_filters_columns:
                    save_df = merge_coalesce_df.sortWithinPartitions(bloom_filters_columns)
                    save_df.write.option("compression","zlib").mode("append").format("orc").save(temp_path)
                else:
                    merge_coalesce_df.write.option("compression","zlib").mode("append").format("orc").save(temp_path)
            if s3_backed:
                target_path = 's3a://' + bucket_name + '/' + target_schema + '/' + target_tablename
                source_path = 'hdfs://getnamenode/apps/hive/warehouse/'  + target_schema + '.db/' + target_tablename + '_tmp' + '/*'
                print source_path
                print target_path
                (ret, out, err) = run_cmd(['hadoop', 'distcp', source_path, target_path])
                (ret, out, err) = run_cmd(['hadoop','fs', '-rm','-r',source_path])
        else:
            if (partitioned):
                from pyspark.sql.functions import col
                incremental_df = sqlContext.sql(incremental_sql_query)

                first_partitioned_list = incremental_df.select(first_partitioned_column) \
                                        .rdd.flatMap(lambda x: x).distinct().collect()
                if second_partitioned_column <> 'None':
                    second_partitioned_list = incremental_df.select(second_partitioned_column)\
                                        .rdd.flatMap(lambda x: x).distinct().collect()
                    merge_df           = sqlContext.sql(target_sql_query)\
                                        .where(col(first_partitioned_column).isin(first_partitioned_list) & \
                                               col(second_partitioned_column).isin(second_partitioned_list))
                else:
                    merge_df           = sqlContext.sql(target_sql_query) \
                                        .where(col(first_partitioned_column).isin(first_partitioned_list))
                join_column_list       = join_columns.split(",")
                output_df              = merge_df.join(incremental_df,join_column_list,"leftanti")
                final_df               = output_df.union(incremental_df)
                if bloom_filters_columns:
                    save_df = final_df.repartition(len(final_df.select(bucket_column_list).distinct().collect()),bucket_column_list) \
                               .sortWithinPartitions(bloom_filter_list)
                else:
                    save_df = final_df.repartition(len(final_df.select(bucket_column_list).distinct().collect()),bucket_column_list)

                save_df.persist()
                save_df.count()

                #final_df.persist()
                if second_partitioned_column <> 'None':
                    for first_partition in first_partitioned_list:
                        for second_partition in  second_partitioned_list:
                            final_path = path + first_partitioned_column + '=' + format(first_partition) + '/' +  \
                                    second_partitioned_column + '=' + format(second_partition)
                            write_df = save_df.where(col(first_partitioned_column).isin(format(first_partition)) &
                                                      col(second_partitioned_column).isin(format(second_partition)))
                            write_df.write.option("compression","zlib").mode("overwrite").format("orc").partitionBy(partition_column_list).save(path)
                else:
                    for first_partition in first_partitioned_list:
                        final_path = path + first_partitioned_column + '=' + format(first_partition)
                        print path
                        write_df = save_df.where(col(first_partitioned_column).isin(format(first_partition)))
                        write_df.write.option("compression","zlib").mode("overwrite").format("orc").partitionBy(partition_column_list).save(final_path)
        # Incremental Update of non-partitioned table
            else:
                incremental_df = sqlContext.sql(incremental_sql_query)
                current_df = sqlContext.sql(target_sql_query)

                join_column_list       = join_columns.split(",")
                output_df              = current_df.join(incremental_df,join_column_list,"leftanti")

                merge_df = output_df.union(incremental_df)
                merge_df.persist()
                merge_df.count()

                if merge_df.rdd.getNumPartitions() > 300:
                    merge_coalesce_df = merge_df.coalesce(300)
                else:
                    merge_coalesce_df = merge_df
                if bloom_filters_columns:
                    save_df = merge_coalesce_df.sortWithinPartitions(bloom_filters_columns)
                    save_df.write.option("compression","zlib").mode("overwrite").format("orc").save(path)
                else:
                    merge_coalesce_df.write.option("compression","zlib").mode("overwrite").format("orc").save(path)

        if (partitioned):
            repair_table_sql = 'MSCK REPAIR TABLE ' + table_name
            sqlContext.sql(repair_table_sql)

        refresh_metadata_sql = 'REFRESH TABLE ' + table_name
        sqlContext.sql(refresh_metadata_sql)
        sqlContext.sql(refresh_metadata_sql)

    except Exception as e:
        print 'Exception while loading data:', e
        sys.exit(1)

    if archived_enabled:
        target_path = 's3a://' + bucket_name + '/' + target_schema + '/' + target_tablename + '_bkp/'
        if  s3_backed:
            source_path = 's3a://' + bucket_name + '/' + source_schema + '/' + source_tablename + '/'
        else:
            source_path = 'hdfs://apps/hive/warehouse/'  + source_schema + '.db/' + source_tablename + '/*'
        print source_path
        print target_path
        (ret, out, err) = run_cmd(['hadoop', 'distcp', source_path, target_path])
        print "Errors:",err
Example #20
                outlier_mean.astype(str), z_score.astype(str),
                normal_std.astype(str), outlier_std.astype(str)))
        res = '\t'.join(['\t'.join(r) for r in res])
        abnormal_features.append('\t'.join(row.astype(str).values.tolist()) +
                                 '\t' + res)
    return abnormal_features


if __name__ == "__main__":
    sparkConf = SparkConf()
    sparkConf.setAppName("dagang abnormal segment")
    sparkConf.set("spark.kryoserializer.buffer.max", "128")
    sc = SparkContext(conf=sparkConf)
    sc.setLogLevel("WARN")
    sqlCtx = HiveContext(sc)
    sqlCtx.setConf("spark.sql.parquet.binaryAsString", "true")
    sqlCtx.setConf("spark.sql.hive.convertMetastoreParquet", "true")
    sqlCtx.setConf("spark.sql.parquet.int96AsTimestamp", "true")

    executor_cores = int(sparkConf.get('spark.executor.cores'))
    num_executors = int(sparkConf.get('spark.executor.instances'))
    num_partitions = executor_cores * num_executors * 3

    features = [
        'area', 'down_oscillation', 'down_stroke', 'down_stroke_ratio',
        'down_stroke_zaihe', 'down_up_oscillation_ratio',
        'down_up_stroke_zaihe_ratio', 'down_up_zaihe_ratio', 'down_zaihe',
        'left_upper_area', 'left_upper_area_ratio', 'max_weiyi',
        'max_weiyi_zaihe', 'max_zaihe', 'min_max_zaihe_ratio', 'min_weiyi',
        'min_weiyi_zaihe', 'min_zaihe', 'up_oscillation', 'up_stroke',
        'up_stroke_ratio', 'up_stroke_zaihe', 'up_zaihe'
Example #21
def run(yarn=None, verbose=None, campaign=None, tier=None):
    """
    Main function to run pyspark job. It requires a schema file, an HDFS directory
    with data and optional script with mapper/reducer functions.
    """

    # define spark context, it's main object which allow to communicate with spark
    ctx = spark_context('cms', yarn, verbose)

    quiet_logs(ctx)

    sqlContext = HiveContext(ctx)

    sqlContext.setConf("spark.sql.files.ignoreCorruptFiles", "true")
    sqlContext.sql("set spark.sql.files.ignoreCorruptFiles=true")

    df = sqlContext.read.format('com.databricks.spark.csv')\
                        .options(header='true', treatEmptyValuesAsNulls='true', nullValue='null')\
                        .load('hdfs:///cms/aggregation/sizes/part-*')

    if campaign != None and tier != None:
        campaign_tier_df = df.where(df.campaign == campaign)\
                             .where(df.tier == tier)

        campaign_tier = map(lambda row: row.asDict(),
                            campaign_tier_df.collect())

        print 'Average size: %s' % bytes_to_readable(
            float(campaign_tier[0]['size_average']))
        print 'Average in period of existence: %s' % bytes_to_readable(
            float(campaign_tier[0]['average_size_in_period']))
        print 'Max size: %s' % bytes_to_readable(
            float(campaign_tier[0]['size_max']))
        print 'T1 size: %s' % bytes_to_readable(
            float(campaign_tier[0]['t1_size']))
        print 'T2 size: %s' % bytes_to_readable(
            float(campaign_tier[0]['t2_size']))
        print 'T3 size: %s' % bytes_to_readable(
            float(campaign_tier[0]['t3_size']))

    date_to_timestamp_udf = udf(lambda date: time.mktime(
        datetime.datetime.strptime(date, "%Y%m%d").timetuple()))

    months = [1, 2, 3, 4, 5, 6, 9, 12]

    for month in months:
        now = (datetime.datetime.now() -
               datetime.datetime(1970, 1, 1)).total_seconds()
        seconds = month * 30 * 24 * 60 * 60
        not_accessed_df = df.withColumn(
            'date_timestamp', date_to_timestamp_udf(df.last_access_date))
        not_accessed_df = not_accessed_df.where(
            now - not_accessed_df.date_timestamp > seconds)
        not_accessed_df = not_accessed_df.withColumn(
            "size_average", not_accessed_df["size_average"].cast(DoubleType()))

        total_size = not_accessed_df.groupBy().sum('size_average').rdd.map(
            lambda x: x[0]).collect()[0] or 0

        print 'Size of data not accessed for last %d month(s): %s' % (
            month, bytes_to_readable(total_size))

    ctx.stop()
    def process(time, rdd):
        print("========= %s =========" % str(time))

        try:
            sqlContext = HiveContext(sc)
            # FIX: memory error Spark 2.0 bug ( < 2.0 )
            sqlContext.setConf("spark.sql.tungsten.enabled", "false")

            if rdd.count() < 1:
                return

            sqlRdd = rdd.map(lambda x: json.loads(x)).map(
                lambda r: Row(messageid=r["messageid"],
                              messagedate=datetime.datetime.strptime(
                                  r["messagedate"], '%Y%m%d%H%M%S'),
                              value=r["value"],
                              metrics=r["metrics"],
                              name=r["name"]))
            speedDataFrame = sqlContext.createDataFrame(sqlRdd)

            batch_table_name = config.get_lambda_config(
                "lambda_speedlayer", "speed_batch_table")
            speedDataFrame.write.mode("append").saveAsTable(batch_table_name)

            # if S3 vals defined then save also to OBS (s3)
            s3_full_path = config.get_lambda_config("lambda_speedlayer",
                                                    "s3_full_path")
            if s3_full_path and False:
                speedDataFrame.write.parquet(s3_full_path, mode="append")

            speedDataFrame.show()
            # Creates a temporary view using the DataFrame.
            temp_table_name = config.get_lambda_config("lambda_speedlayer",
                                                       "speed_temp_table")
            speedDataFrame.registerTempTable(temp_table_name)

            if __debug__:
                speedDataFrame.printSchema()
                speedDataFrame.head(10)

            # handling sql alert file
            alertsqlfile = config.get_lambda_config("lambda_speedlayer",
                                                    "alert_sql_path")

            alertsql = load_resource_file(alertsqlfile)
            # Execute alarm query and get the alam dataset using the temp table
            alertDataFrame = sqlContext.sql(alertsql)
            alertDataFrame.show()
            alertDataFrame.printSchema()

            # save all values to HBASE
            # IF NEED FILTER LATER .filter(lambda x: str(x["metrics"])=='action-credit-limit') \
            # create HBASE mapper
            rowRdd = rdd.map( lambda x: json.loads(x))\
                .map(lambda r: ( str(r["metrics"]) ,[ str(r["name"])+"-"+datetime.datetime.now().strftime("%Y%m%d%H%M%S"), "driver" if "driver" in str(r["metrics"]) else "car", str(r["metrics"]), str(r["value"])  ] ))

            table = config.get_lambda_config("lambda_speedlayer",
                                             "speed_inbox_table")
            host = config.get_lambda_config("lambda_speedlayer", "hbase_host")
            keyConv = "org.apache.spark.examples.pythonconverters.StringToImmutableBytesWritableConverter"
            valueConv = "org.apache.spark.examples.pythonconverters.StringListToPutConverter"
            conf = {
                "hbase.zookeeper.quorum": host,
                "hbase.mapred.outputtable": table,
                "mapreduce.outputformat.class":
                "org.apache.hadoop.hbase.mapreduce.TableOutputFormat",
                "mapreduce.job.output.key.class":
                "org.apache.hadoop.hbase.io.ImmutableBytesWritable",
                "mapreduce.job.output.value.class":
                "org.apache.hadoop.io.Writable"
            }
            rowRdd.saveAsNewAPIHadoopDataset(conf=conf,
                                             keyConverter=keyConv,
                                             valueConverter=valueConv)
        except Exception as streamerror:
            logging.error("Stream error: %s", streamerror)
            print(streamerror)
            raise
Example #23
def run(fout, yarn=None, verbose=None):
    """
    Main function to run pyspark job. It requires a schema file, an HDFS directory
    with data and optional script with mapper/reducer functions.
    """
    
    # define spark context, it's main object which allow to communicate with spark
    ctx = spark_context('cms', yarn, verbose)

    quiet_logs(ctx)

    sqlContext = HiveContext(ctx)
    
    sqlContext.setConf("spark.sql.files.ignoreCorruptFiles","true")
    sqlContext.sql("set spark.sql.files.ignoreCorruptFiles=true")

    # date, site, dataset, size, replica_date, groupid
    schema = StructType([
        StructField("date", StringType(), True),
        StructField("site", StringType(), True),
        StructField("dataset", StringType(), True),
        StructField("size", DoubleType(), True),
        StructField("replica_date", StringType(), True),
        StructField("groupid", StringType(), True)
    ])

    df = sqlContext.read.format('com.databricks.spark.csv')\
                        .options(treatEmptyValuesAsNulls='true', nullValue='null')\
                        .load(PHEDEX_HDFS_URL, schema=schema)
                        
    # Remove all tape sites
    is_tape = lambda site: site.endswith('_MSS') | site.endswith('_Buffer') | site.endswith('_Export')
    df = df.where(is_tape(df.site) == False)

    # Remove all non VALID datasets
    remove_invalid_datasets(df, sqlContext, verbose)

    # Get accesses data frame
    accesses_df = get_dataset_access_dates(sqlContext)

    # extract_campaign_udf = udf(lambda dataset: dataset.split('/')[2].split('-')[0])
    extract_tier_udf = udf(lambda dataset: dataset.split('/')[3])
    days_delta_udf = udf(lambda t1, t2: (datetime.datetime.fromtimestamp(float(t1)) - datetime.datetime.fromtimestamp(float(t2))).days + 1)
    count_udf = udf(lambda list: len(list))
    get_t1_size = udf(lambda size, site: size if site.startswith('T1') else 0)
    get_t2_size = udf(lambda size, site: size if site.startswith('T2') else 0)
    get_t3_size = udf(lambda size, site: size if site.startswith('T3') else 0)

    df = df.withColumn('campaign', get_extract_campaign_udf()(df.dataset))\
           .withColumn('tier', extract_tier_udf(df.dataset))\
           .withColumn('date_min', get_date_to_timestamp_udf()(df.date))\
           .withColumn('date_max', get_date_to_timestamp_udf()(df.date))\
           .withColumn('size_average', df.size)\
           .withColumn('t1_size', get_t1_size(df.size, df.site))\
           .withColumn('t2_size', get_t2_size(df.size, df.site))\
           .withColumn('t3_size', get_t3_size(df.size, df.site))

    df = df.groupBy(['campaign', 'tier'])\
           .agg({'date_min': 'min', 'date_max': 'max', 'date': 'collect_set', 'size_average': 'avg', 'size': 'max', 't1_size': 'avg', 't2_size': 'avg', 't3_size': 'avg'})\
           .withColumnRenamed('min(date_min)', 'date_min')\
           .withColumnRenamed('max(date_max)', 'date_max')\
           .withColumnRenamed('collect_set(date)', 'days_count')\
           .withColumnRenamed('avg(size_average)', 'size_average')\
           .withColumnRenamed('max(size)', 'size_max')\
           .withColumnRenamed('avg(t1_size)', 't1_size')\
           .withColumnRenamed('avg(t2_size)', 't2_size')\
           .withColumnRenamed('avg(t3_size)', 't3_size')

    df = df.withColumn('period_days', days_delta_udf(df.date_max, df.date_min))\
           .withColumn('days_count', count_udf(df.days_count))\
           .withColumn('date_min', get_timestamp_to_date_udf()(df.date_min))\
           .withColumn('date_max', get_timestamp_to_date_udf()(df.date_max))
        
    df = df.withColumn('existence_in_period', df.days_count / df.period_days)
    df = df.withColumn('average_size_in_period', df.size_average * df.existence_in_period)

    df.show(100, truncate=False)

    # campaign, tier, date_max, date_min, days_count, size_max, size_average, period_days, existence_in_period, average_size_in_period, t1_size, t2_size, t3_size, last_access_date
    df = df.join(accesses_df, 'campaign')
    
    df.show(100, truncate=False)

    # write out results back to HDFS, the fout parameter defines area on HDFS
    # it is either absolute path or area under /user/USERNAME
    if fout:
        df.write.format("com.databricks.spark.csv")\
                          .option("header", "true").save(fout)
    
    ctx.stop()
Example #24
            adjclose float
            )
            ROW FORMAT DELIMITED FIELDS TERMINATED BY '\t'
    """)

    sqlContext.sql(""" use fex """)
    df = sqlContext.sql("""
    SELECT
        *
    FROM
        eod_spx
    WHERE
        symbol = "SPX"
        AND date >= "2010-01-01"
        AND date <= "2010-06-30"
    """)
    sqlContext.sql(""" use fex_test """)
    df.repartition(1).insertInto("eod_spx", True)


if __name__ == "__main__":
    conf = SparkConf()
    conf.set("spark.executor.instances", "4")
    conf.set("spark.executor.cores", "4")
    conf.set("spark.executor.memory", "8g")
    sc = SparkContext(appName=__file__, conf = conf)
    sqlContext = HiveContext(sc)
    sqlContext.setConf("spark.sql.shuffle.partitions", "1")
    main(sc, sqlContext)
    sc.stop()
import sys
import string
import datetime
from datetime import date, timedelta

import pytz
from pyspark import SparkConf, SparkContext
from pyspark.sql import HiveContext

from scp import SCPClient
from pyspark.sql import functions as F

from subprocess import call

APP_NAME = "con"
sc = SparkContext("", APP_NAME)
sc.setLogLevel("WARN")
sqlContext = HiveContext(sc)
sqlContext.setConf("spark.sql.parquet.binaryAsString", "true")
PROD = "12.333.201.21"

## Loop through DATE partitions
yesterday = datetime.datetime.now(
    pytz.timezone('US/Central')).date() - timedelta(1)
dayte = str(
    yesterday.strftime("%Y") + '-' + yesterday.strftime("%m") + '-' +
    yesterday.strftime("%d"))


def daterange(start_date, end_date):
    for n in range(int((end_date - start_date).days)):
        yield start_date + timedelta(n)

Example #26
from __future__ import print_function

import os
import sys

from pyspark import SparkContext
from pyspark.sql import SQLContext, HiveContext
from pyspark.sql.readwriter import DataFrameWriter
from pyspark.sql.types import *

if __name__ == "__main__":
    sc = SparkContext(appName="PythonSQL")
    sqlContext = SQLContext(sc)
    hiveContext = HiveContext(sc)
    hiveContext.setConf("hive.exec.dynamic.partition", "true")
    hiveContext.setConf("hive.exec.dynamic.partition.mode", "nonstrict")
    hiveContext.setConf("spark.sql.orc.filterPushdown", "true")

    ## Create a DataFrame from the file(s) pointed to by path
    gwcdr = hiveContext.read.json("/user/wgovea/people.json.gz")

    # The inferred schema can be visualized using the printSchema() method.
    gwcdr.printSchema()

    # Register this DataFrame as a table.
    gwcdr.registerTempTable("gwcdr_tmp")

    data = hiveContext.sql(
        "SELECT name,age,country,ts from gwcdr_tmp").write.format(
            "orc").partitionBy("ts").mode("append").insertInto("people")
Example #27
from pyspark.sql.functions import regexp_replace, col, udf
from langdetect import detect
from spacy.lemmatizer import Lemmatizer
from spacy.lang.en import LOOKUP as enlook
from spacy.lang.de import LOOKUP as delook
from pyspark.sql.types import StringType
from pyspark.ml.feature import Tokenizer, StopWordsRemover
from pyspark.sql.functions import explode
from pyspark.sql import HiveContext
import pyspark.sql.functions as func
from pyspark import SparkConf, SparkContext

conf = SparkConf().setAppName('MyFirstStandaloneApp')
sc = SparkContext(conf=conf)
#sqlContext = sql.SQLContext(sc)
hiveContext = HiveContext(sc)
hiveContext.setConf("hive.metastore.uris",
                    "thrift://s12m.westeurope.cloudapp.azure.com:9083")


class WordCount:
    def __init__(self, dataframe):
        self.dataframe = dataframe

    def transform(self):
        df2 = self.dataframe.withColumn(
            "_2",
            regexp_replace(col("_2"), "[\"'./§$&+,:;=?@#–|'<>.^*()%!-]", ""))
        df = df2.withColumn("_2", regexp_replace(col("_2"), "\\s{2,}", ""))

        language_detect = udf(lambda x: detect(x), returnType=StringType())
        df3 = df.withColumn("lang", language_detect('_2'))
Example #28
            adjclose float
            )
            ROW FORMAT DELIMITED FIELDS TERMINATED BY '\t'
    """)

    sqlContext.sql(""" use fex """)
    df = sqlContext.sql("""
    SELECT
        *
    FROM
        eod_spx
    WHERE
        symbol = "SPX"
        AND date >= "2010-01-01"
        AND date <= "2010-06-30"
    """)
    sqlContext.sql(""" use fex_test """)
    df.repartition(1).insertInto("eod_spx", True)


if __name__ == "__main__":
    conf = SparkConf()
    conf.set("spark.executor.instances", "4")
    conf.set("spark.executor.cores", "4")
    conf.set("spark.executor.memory", "8g")
    sc = SparkContext(appName=__file__, conf=conf)
    sqlContext = HiveContext(sc)
    sqlContext.setConf("spark.sql.shuffle.partitions", "1")
    main(sc, sqlContext)
    sc.stop()
def sql_hive_context_example(spark):
    
    # create hive context object.
    hive_ctx = HiveContext(spark.sparkContext)

    # createDataFrame
    l = [('Alice', 18), ('Bob', 20), ('Charley', 22)]
    df = hive_ctx.createDataFrame(l, ('name', 'age'))
    print("createDataFrame API finished")

    # registerDataFrameAsTable 
    hive_ctx.registerDataFrameAsTable(df, "table1")
    print("registerDataFrameAsTable API finished")

    # sql
    tmp_df = hive_ctx.sql("select * from table1")
    tmp_df.show()
    print("sql API finished")

    # table
    tmp_df = hive_ctx.table("table1")
    tmp_df.show()
    print("table API finished")

    # tableNames
    table_names = hive_ctx.tableNames()
    print(table_names)
    print("tableNames API finished")

    # tables
    tables = hive_ctx.tables()
    print(tables)
    print("tables API finished")

    # range
    tmp_df = hive_ctx.range(1,10,2)
    tmp_df.show()
    print("range API finished")

    # dropTempTable
    hive_ctx.dropTempTable("table1")
    table_names = hive_ctx.tableNames()
    print(table_names)
    print("dropTempTable API finished")

    # cacheTable & uncacheTable & clearCache
    df = hive_ctx.range(1,10,2)
    hive_ctx.registerDataFrameAsTable(df, "table")
    hive_ctx.cacheTable("table")
    hive_ctx.uncacheTable("table")
    hive_ctx.clearCache()
    print("cacheTable & uncacheTable & clearCache API finished")

    # createExternalTable

    # newSession

    # registerFunction
    # Deprecated in 2.3.0. Use :func:`spark.udf.register` instead

    # registerJavaFunction
    # Deprecated in 2.3.0. Use :func:`spark.udf.registerJavaFunction` instead

    # setConf & getConf
    hive_ctx.setConf("key1", "value1")
    value = hive_ctx.getConf("key1")
    print(value)
    print("setConf & getConf API finished")

    # refreshTable
    # Exception: An error occurred while calling o26.refreshTable:
    # Method refreshTable([class java.lang.String]) does not exist
    
    print("Finish running HiveContext API")
Example #30
        select * from tmp_table
        """.format(table_name=table_name, dt=dt, version=version)
    print(insert_sql)
    hiveCtx.sql(insert_sql)


if __name__ == '__main__':
    parser = argparse.ArgumentParser()
    parser.add_argument("-d", "--date", help="work date", default="1")
    parser.add_argument("-v", "--version", help="version", default="1")

    args = parser.parse_args()
    print("%s parameters:%s" % (sys.argv[0], args))

    begin_time = time.time()
    print("%s begin at %s" % (sys.argv[0], str(datetime.datetime.now())))

    conf = SparkConf()
    sc = SparkContext(conf=conf, appName="sp_ind")
    sc.setLogLevel("WARN")
    hiveCtx = HiveContext(sc)
    hiveCtx.setConf('spark.shuffle.consolidateFiles', 'true')
    hiveCtx.setConf('spark.sql.shuffle.partitions', '1000')
    hiveCtx.sql('use app')
    dt = '2020-05-25'
    version = 'query-similar-month'
    # Create table.
    create_table()
    #negative_sampling(dt, version)
    postive_sampling(dt, version)
        ddl_str = '%s %s' % (col_name, col_dtype)
        projection_str = '  %s ' % (col_name)
        ddl_list.append(ddl_str)
        select_list.append(projection_str)
ddl_str = """create external table output_table_omtr( %s )   
           stored as parquet 
           location 's3://move-dataeng-temp-dev/glue-etl/parquet_block_poc/hit_data_pdt_512mb_ctas/'
           """ % (','.join(ddl_list))

#---
print 'ddl_str = ', ddl_str
print 'select_str = ', ','.join(select_list)
sqlContext.sql(ddl_str)

df.createOrReplaceTempView("hit_data_big")
sqlContext.setConf("hive.exec.dynamic.partition", "true")
sqlContext.setConf("hive.exec.dynamic.partition.mode", "nonstrict")

print df.count()
sqlContext.sql("show tables").show()

#----- Hive section ----
hadoopConf = {}
iterator = sc._jsc.hadoopConfiguration().iterator()
while iterator.hasNext():
    prop = iterator.next()
    hadoopConf[prop.getKey()] = prop.getValue()
for item in sorted(hadoopConf.items()):
    print(item)

for item in sorted(sc._conf.getAll()):
Example #32
def create_hive_context(spark_context):
    hive_context = HiveContext(spark_context)
    hive_context.setConf('hive.exec.dynamic.partition.mode', 'nonstrict')
    hive_context.setConf('hive.exec.max.dynamic.partitions', '17520')
    return hive_context
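# Usage sketch (illustrative; table names are placeholders): with nonstrict dynamic
# partitioning enabled, a partitioned Hive table can be loaded straight from a DataFrame.
# sc = SparkContext.getOrCreate()
# hc = create_hive_context(sc)
# hc.table("db.source_table").write.insertInto("db.partitioned_target")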
Example #33
Fails after 2+ hours. Problem seems to be "(Too many open files)"
Likely several thousand files are open at one time.


"""


from pyspark import SparkContext
from pyspark.sql import HiveContext

sc = SparkContext()
sqlContext = HiveContext(sc)

# snappy compression recommended for Arrow
# Interesting- snappy is slightly smaller than gz for the 10 rows.
sqlContext.setConf("spark.sql.parquet.compression.codec", "snappy")

# Testing
#pems = sqlContext.sql("SELECT * FROM pems LIMIT 10")

# This works
# pems = sqlContext.sql("SELECT * FROM pems WHERE station IN (402265, 402264, 402263, 402261, 402260)")

pems = sqlContext.sql("SELECT * FROM pems ORDER BY station")

# Don't see options about file chunk sizes, probably comes from some 
# environment variable
# Later versions:
# pems.write.parquet("pems_sorted", compression = "snappy")

#pems.write.parquet("pems_station", partitionBy="station")