from pyspark import SparkConf, SparkContext
from pyspark.sql import HiveContext


def main():
    # Configure the Spark application
    conf = SparkConf().setAppName("Citi-AddPyFile")
    sc = SparkContext(conf=conf)
    hiveContext = HiveContext(sc)

    # Ship the additional Python module this job depends on to the executors
    sc.addPyFile("hdfs:///data_lake/Spark_Citi/lib/utileriasCiti.py")
    from utileriasCiti import GuardaMiTabla

    # Create a DataFrame from a JSON file
    fileName = "/data_lake/Spark_Citi/config/parametros_citi_destinos.json"
    data = hiveContext.read.format("json").option('encoding', 'UTF-8').load(fileName)
    data.show(100, truncate=False)
from pyspark import SparkConf, SparkContext
from pyspark.sql import HiveContext


def main():
    # Initialise Spark from the parsed command-line arguments
    args = parse_args()
    sc = SparkContext(conf=SparkConf().setAppName(args.app_name))
    sc.setLogLevel("WARN")
    hiveContext = HiveContext(sc)

    # Read the CSV data with a header row and inferred column types
    data_df = hiveContext.read.csv(args.path, header=True, inferSchema=True)

    # The destination is passed on the command line as a stringified dict
    dest_dict = eval(args.destination)
    assert isinstance(dest_dict, dict), "please pass the destination as str(dict)"
    save_data.save_df(data_df, dest_dict)
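# main() above relies on a parse_args() helper and a save_data module that are not
# shown in this excerpt. The sketch below is only an illustration of what they might
# look like; the argument names ("--app-name", "--path", "--destination") and the
# save_df() behaviour (writing to a Hive table named in dest_dict) are assumptions,
# not the original implementation. save_data is presumably a separate module in the
# original project; a tiny stand-in class is used here so the sketch is self-contained.
import argparse


def parse_args():
    parser = argparse.ArgumentParser(description="Load a CSV into Hive")
    parser.add_argument("--app-name", dest="app_name", default="csv-to-hive")
    parser.add_argument("--path", required=True, help="input CSV path")
    parser.add_argument("--destination", required=True,
                        help="destination spec as a stringified dict, e.g. \"{'table': 'db.tbl'}\"")
    return parser.parse_args()


class save_data(object):
    @staticmethod
    def save_df(df, dest_dict):
        # Write the DataFrame to the Hive table named in dest_dict.
        df.write.mode(dest_dict.get("mode", "overwrite")).saveAsTable(dest_dict["table"])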
def __init__(self):
    """
    Create a spark context.

    The spark configuration is taken from xframes/config.ini and from
    the values set in SparkInitContext.set() if this has been called.
    """
    # This is placed here because otherwise it causes an error when used in a spark slave.
    from pyspark import SparkConf, SparkContext, SQLContext, HiveContext

    # This reads from default.ini and then xframes/config.ini if they exist.
    self._env = Environment.create()
    context = create_spark_config(self._env)
    verbose = self._env.get_config('xframes', 'verbose', 'false').lower() == 'true'
    hdfs_user_name = self._env.get_config('webhdfs', 'user', 'hdfs')
    os.environ['HADOOP_USER_NAME'] = hdfs_user_name
    config_pairs = [(k, v) for k, v in context.iteritems()]
    self._config = SparkConf().setAll(config_pairs)
    if verbose:
        print 'Spark Config: {}'.format(config_pairs)

    self._sc = SparkContext(conf=self._config)
    self._sqlc = SQLContext(self._sc)
    self._hivec = HiveContext(self._sc)
    self.zip_path = []
    version = [int(n) for n in self._sc.version.split('.')]
    self.status_tracker = self._sc.statusTracker()
    if cmp(version, [1, 4, 1]) >= 0:
        self.application_id = self._sc.applicationId
    else:
        self.application_id = None

    if verbose:
        print 'Spark Version: {}'.format(self._sc.version)
        if self.application_id:
            print 'Application Id: {}'.format(self.application_id)

    if not context['spark.master'].startswith('local'):
        zip_path = self.build_zip(get_xframes_home())
        if zip_path:
            self._sc.addPyFile(zip_path)
            self.zip_path.append(zip_path)

    trace_flag = self._env.get_config('xframes', 'rdd-trace', 'false').lower() == 'true'
    XRdd.set_trace(trace_flag)
    atexit.register(self.close_context)
import os
import shutil

import pyspark.sql
from pyspark.sql import SQLContext, HiveContext


def sql(sc):
    try:
        if hasattr(pyspark.sql, 'types'):
            # pyspark >= 1.3
            yield HiveContext(sc)
        else:
            yield SQLContext(sc)
    finally:
        # Clean up the local Derby metastore files that the HiveContext leaves behind.
        dbpath = 'metastore_db'
        logpath = 'derby.log'
        if os.path.exists(dbpath):
            assert os.path.isdir(dbpath)
            shutil.rmtree(dbpath)
        if os.path.exists(logpath):
            assert os.path.isfile(logpath)
            os.remove(logpath)
from pyspark.sql import HiveContext
from pyspark.ml.feature import VectorAssembler, MinMaxScaler
from pyspark.ml.clustering import KMeans


def process(sc):
    hiveContext = HiveContext(sc)
    hql = "select * from kmeans_cluster_feature where pt = '%s'" % (pt)
    df_raw = hiveContext.sql(hql).repartition(160)
    columns = df_raw.columns[1:-2]
    feature_num = len(columns)

    # type cast (kept for reference)
    #df_tmp = df_raw
    #for k, i in zip(columns, range(feature_num)):
    #    df_tmp = df_tmp.withColumn(k, df_tmp[i + 1] * 1.0)

    # Imputer: fill missing values with the column means
    mean_value = df_raw.describe().collect()[1]
    print mean_value
    df_train = df_raw
    for k, i in zip(columns, range(feature_num)):
        df_train = df_train.na.fill({k: mean_value[i + 1]})

    # min-max scaling
    vecAssembler = VectorAssembler(inputCols=columns, outputCol="features")
    df_b_s = vecAssembler.transform(df_train)
    mmScaler = MinMaxScaler(inputCol="features", outputCol="scaled")
    model = mmScaler.fit(df_b_s)
    df_scaled = model.transform(df_b_s)

    # k-means clustering
    n_clusters_ = 20
    model = KMeans(k=n_clusters_, initSteps=10, maxIter=300,
                   featuresCol='scaled').fit(df_scaled)
    df_result = model.transform(df_scaled)

    # map each cluster to a sensitivity bucket by the first coordinate of its center
    global sensitivity_1, sensitivity_3
    sensitivity_1 = []
    sensitivity_2 = []
    sensitivity_3 = []
    key_cnt = []
    centers = model.clusterCenters()
    for xx, yy in zip(centers, range(n_clusters_)):
        key_cnt.append([yy, xx[0]])
    sorted_cluster = sorted(key_cnt, key=lambda asd: asd[1])
    split = n_clusters_ / 3
    split_end = n_clusters_ - split
    for xx, yy in zip(sorted_cluster, range(n_clusters_)):
        if yy < split:
            sensitivity_3.append(xx[0])
        elif yy >= split_end:
            sensitivity_1.append(xx[0])
        else:
            sensitivity_2.append(xx[0])

    # result
    df_result.map(result_process).saveAsTextFile("kmeans_cluster_result/pt=%s/" % (pt))
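# process() above maps df_result through a result_process function that is not
# included in the excerpt. The version below is only a guess at its shape: it assumes
# the output should be one delimited text line per scored row, carrying the row id
# (first column) and the predicted cluster, which is what saveAsTextFile() needs.
def result_process(row):
    # Hypothetical: emit "<id>\t<cluster id>" for each scored row.
    return '%s\t%d' % (row[0], row['prediction'])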
from pyspark import SparkConf, SparkContext
from pyspark.sql import HiveContext


def load_dataset():
    '''Return real Telco customers and labels.'''
    #df = pd.read_excel(ibmxlsxpath)
    conf = SparkConf().setAppName("Telco Churn IRL")
    sc = SparkContext(conf=conf)
    sqlContext = HiveContext(sc)
    df = sqlContext.sql("select * from jfletcher.churn_test_3").toPandas()
    df = drop_missing(df).reset_index()
    df.index.name = 'id'
    features, labels = utils.splitdf(df, labelcol)
    features = booleanize_senior_citizen(features)
    features = utils.drop_non_features(features, cols)
    features = utils.categorize(features, cols)
    labels = (labels == 'Yes')
    return features, labels
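# load_dataset() depends on drop_missing() and booleanize_senior_citizen(), plus a
# utils module, none of which appear in this excerpt. The two sketches below show one
# plausible implementation; the column name 'SeniorCitizen' and the drop-all-missing
# strategy are assumptions about the Telco churn schema, not the original code.
def drop_missing(df):
    # Drop rows that contain any missing value.
    return df.dropna()


def booleanize_senior_citizen(features):
    # Convert the hypothetical 0/1 'SeniorCitizen' column to a boolean.
    features['SeniorCitizen'] = features['SeniorCitizen'].astype(bool)
    return features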
def run_hive():
    dic = [{'id': '1,2,3'}]
    # df = pd.DataFrame(dic)
    sc = SparkContext()
    # sc.parallelize(dic)
    sql_ctx = HiveContext(sc)
    # sql_ctx.registerDataFrameAsTable(df, "aaa")
    sdf = sql_ctx.createDataFrame(dic)
    sdf.registerTempTable('aaa')
    # sdf.show()
    # df2 = sql_ctx.sql("select split(id, ',') from aaa")
    # explode() is a table-generating function and cannot be nested inside
    # collect_list(); expand the values with a lateral view first, then aggregate.
    df2 = sql_ctx.sql(
        'select collect_list(cast(x as string)) '
        'from aaa lateral view explode(split(id, ",")) t as x'
    )
    df2.show()
def get_sqlContext():
    import sys
    import os
    try:
        sc.stop()
    except:
        pass
    spark_home = '/opt/cloudera/parcels/CDH/lib/spark/'
    os.environ['SPARK_HOME'] = spark_home
    sys.path.insert(0, os.path.join(spark_home, 'python'))
    sys.path.insert(0, os.path.join(spark_home, 'python/lib/py4j-0.9-src.zip'))
    from pyspark import SparkContext, SparkConf
    from pyspark.sql import HiveContext

    conf = SparkConf().setAppName('drops_finding') \
        .setMaster('yarn-client') \
        .setExecutorEnv('PATH', os.environ['PATH']) \
        .set('spark.executor.cores', '5') \
        .set('spark.executor.memory', '25g') \
        .set('spark.driver.cores', '5') \
        .set('spark.driver.memory', '25g') \
        .set('spark.yarn.driver.memoryOverhead', '4096') \
        .set('spark.yarn.executor.memoryOverhead', '4096') \
        .set('spark.kryoserializer.buffer.max', '2047') \
        .set('spark.driver.maxResultSize', '8g') \
        .set('spark.dynamicAllocation.enabled', 'true') \
        .set('spark.dynamicAllocation.minExecutors', '10') \
        .set('spark.dynamicAllocation.maxExecutors', '16') \
        .set('spark.dynamicAllocation.initialExecutors', '10') \
        .set('spark.dynamicAllocation.executorIdleTimeout', '60s') \
        .set('spark.dynamicAllocation.schedulerBacklogTimeout', '5s') \
        .set('spark.dynamicAllocation.sustainedSchedulerBacklogTimeout', '5s')

    sc = SparkContext(conf=conf)
    sqlContext = HiveContext(sc)
    return sc, sqlContext
def process_sql():
    filename = '/Users/baoqiang/Downloads/1.txt'
    sc = SparkContext()
    sql_ctx = HiveContext(sc)
    df = sql_ctx.read.json(filename)

    # keywords = ["小包", "小钰"]
    # keywords = ['"{}"'.format(keyword) for keyword in keywords]
    # df = df.where('score > 5 or keyword in ({})'.format(', '.join(keywords)))

    token = '2ZDMkVAQVjN'
    # df = df.where('token = "{}" and get_json_object(share_data, "$.[0].ShareCategory") = 2'.format(token))
    # df = df.where('token = "{}"'.format(token))
    # df.show()

    df.registerTempTable("events")
    q1 = 'SELECT get_json_object(share_data, "$.[0].ShareCategory"), token ' \
         'FROM events where token = "{}"'.format(token)
    res = sql_ctx.sql(q1)
    res.show()
def process_sql_sample():
    filename = '/Users/baoqiang/Downloads/3.txt'
    sc = SparkContext()
    sql_ctx = HiveContext(sc)
    df = sql_ctx.read.json(filename)
    df.registerTempTable("events")

    # q1 = 'SELECT get_json_object(students, "$.[0].name") as name,* FROM events' \
    #      'lateral view explode(split(userl_ids,"[[[")) snTable as user_id where id = {}'.format(1)

    # Strip the surrounding "[{" and "}]" from the serialized students array, then
    # split on "},{" so each element can be exploded into its own row.
    q1 = "select explode(split(substring(students,3,length(students)-4),'\\\\},\\\\{')) as student from events"
    q2 = "select id,concat('{',student,'}') as entities from (select * from events) a " \
         "lateral view explode(split(substring(students,3,length(students)-4),'\\\\},\\\\{')) b as student"
    q3 = "select id,get_json_object(entities,'$.name') as name from (" \
         "select id,concat('{',student,'}') as entities from (select * from events) a " \
         "lateral view explode(split(substring(students,3,length(students)-4),'\\\\},\\\\{')) b as student )"
    res = sql_ctx.sql(q3)
    res.show()
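# For the queries above to work, each line of 3.txt would need to carry a serialized
# "students" array alongside an "id". A hypothetical record (not taken from the
# original data) could look like the line below; q3 then yields one row per student.
#
#   {"id": 1, "students": "[{\"name\": \"alice\"},{\"name\": \"bob\"}]"}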
def __init__(self):
    self.localClusterURL = "local[2]"
    self.clusterMasterURL = "spark://Master:7077"
    self.conf = SparkConf().setAppName('ELT').setMaster(self.localClusterURL)
    self.sc = SparkContext.getOrCreate(self.conf)
    self.sqlContext = SQLContext(self.sc)
    self.hc = HiveContext(self.sc)
    self.jdbcURL = "jdbc:mysql://Master:3306/recommend?useUnicode=true&characterEncoding=utf-8&useSSL=false"
    self.prop = {
        'driver': 'com.mysql.jdbc.Driver',
        'user': '******',
        'password': '******'
    }

    # HDFS paths of the raw recommendation inputs (users, ratings, links, tags)
    self.hdfs_data_path = 'hdfs://Master:9000/movie/data/'
    self.movies_path = self.hdfs_data_path + 'movies.txt'
    self.ratings_path = self.hdfs_data_path + 'ratings.txt'
    self.links_path = self.hdfs_data_path + 'links.txt'
    self.tags_path = self.hdfs_data_path + 'tags.txt'

    # MySQL tables that hold the various result sets
    self.default5Table = 'MovieSizer.operation_default5recommend'
    self.top5Table = 'MovieSizer.oertion_top5recomm'
    self.alsTable = 'MovieSizer.movies_alsTab'
    self.similarTable = 'MovieSizer.movies_movidesimilar'
    self.usesrTable = 'MovieSizer.usesr_userprofile'
    self.ratingTable = 'MovieSizer.operation_rating'
    self.movieTab = 'MovieSizer.movies_movieinfo'
    self.tagTab = 'MovieSizer.movies_movieinfo_typelist'

    # The number of RDD partitions is best set to an integer multiple of the
    # CPU cores allocated to the application.
    self.minPartitions = 8
def _initialize_spark_contexts(gateway):
    java_spark_context = gateway.entry_point.getSparkContext()
    java_spark_conf = java_spark_context.getConf()
    spark_context = SparkContext(
        conf=SparkConf(_jvm=gateway.jvm, _jconf=java_spark_conf),
        gateway=gateway,
        jsc=java_spark_context)

    java_spark_sql_session = gateway.entry_point.getSparkSQLSession()
    spark_version = spark_context.version
    spark_sql_session = None
    if spark_version == "1.6.1":
        from pyspark.sql import HiveContext
        java_sql_context = java_spark_sql_session.getSQLContext()
        spark_sql_session = HiveContext(spark_context, java_sql_context)
    elif spark_version in ["2.0.0", "2.0.1", "2.0.2"]:
        from pyspark.sql import SparkSession
        java_spark_session = java_spark_sql_session.getSparkSession()
        spark_sql_session = SparkSession(spark_context, java_spark_session)
    else:
        raise ValueError("Spark version {} is not supported".format(spark_version))

    return spark_context, spark_sql_session
from pyspark.sql import HiveContext


def hive(string):
    try:
        global sc
        hive_context = HiveContext(sc)
        # The request comes in as "columns-table-condition-complement",
        # with '_' standing in for spaces in the last two parts and '0'
        # meaning "not supplied".
        colunas, nome_tab, condicao, compl = string.split("-")
        if condicao != '0':
            condicao = condicao.replace('_', ' ')
            condicao = 'and ' + condicao
        else:
            condicao = ''
        if compl != '0':
            compl = compl.replace('_', ' ')
            return str(hive_context.sql("select " + colunas + " from " + nome_tab +
                                        " where 1=1 " + condicao + " " + compl).collect())
        else:
            return str(hive_context.sql("select " + colunas + " from " + nome_tab +
                                        " where 1=1 " + condicao).collect())
    except Exception:
        pass
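# A hypothetical call, assuming the "columns-table-condition-complement" format
# sketched above (the table and filter are made up for illustration only):
#
#   hive("name,age-clientes-age_>_30-order_by_age")
#   # -> str of the collected rows of:
#   #    select name,age from clientes where 1=1 and age > 30 order by age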
def create_context(parameters=None):
    if parameters is None:
        parameters = OrderedDict()
        parameters['spark.app.name'] = 'weta_workflow'
        parameters['spark.master'] = 'local'  # 'yarn'
        parameters["spark.executor.instances"] = "8"
        parameters["spark.executor.cores"] = "8"
        parameters["spark.executor.memory"] = "2g"
        parameters["spark.driver.cores"] = "4"
        parameters["spark.driver.memory"] = "1g"
        parameters["spark.logConf"] = "false"
        parameters["spark.app.id"] = "dummy"
        # parameters['spark.debug.maxToStringFields'] = 100

    cls = SparkEnvironment
    if cls._sc:
        cls._sc.stop()

    for key, parameter in parameters.items():
        cls._conf.set(key, parameter)

    cls._sc = SparkContext(conf=cls._conf)
    cls._sqlContext = SQLContext(cls._sc)
    cls._hc = HiveContext(cls._sc)
# coding:utf-8
from pyspark import SparkConf, SparkContext
from pyspark.sql import HiveContext
from a2_week_add_index import *
import pandas as pd
import datetime
import sys

conf = SparkConf()
sc = SparkContext()
hql = HiveContext(sc)


def get_save_table(flag):
    if int(flag) == 0:
        save_table = 'c3_top2_stock_feature_train'
    else:
        save_table = 'c3_top2_stock_feature_test'
    return save_table


def get_is_label(flag):
    if int(flag) == 0:
        is_label = ',label'
    else:
        is_label = ''
    return is_label


args = sys.argv[1:]
if len(args) == 0:
    print 'No flag argument given; defaulting to flag=0 (training-set samples)'
print(sql)
hiveCtx.sql(sql).registerTempTable("temp_table")
insert_sql = """
    insert overwrite table {table_name} partition(dt='{dt}')
    select * from temp_table
""".format(table_name=table_name, dt=dt_str)
print("insert_sql:\n" + insert_sql)
hiveCtx.sql(insert_sql)


if __name__ == "__main__":
    conf = SparkConf()
    sc = SparkContext(conf=conf, appName="sp-tfidf")
    sc.setLogLevel("WARN")
    hiveCtx = HiveContext(sc)
    hiveCtx.setConf('spark.shuffle.consolidateFiles', 'true')
    hiveCtx.setConf('spark.shuffle.memoryFraction', '0.4')
    hiveCtx.setConf('spark.sql.shuffle.partitions', '1000')

    # Default to yesterday's partition when no date argument is given.
    if len(sys.argv) == 1:
        dt = datetime.datetime.now() + datetime.timedelta(-1)
    else:
        dt = datetime.datetime.strptime(sys.argv[1], "%Y%m%d").date()
    dt_str = dt.strftime("%Y-%m-%d")
    yest_dt = dt + datetime.timedelta(-30)
    yest_str = yest_dt.strftime("%Y-%m-%d")

    hiveCtx.sql("use app")
    create_table(hiveCtx)
    getQuery(hiveCtx)
def __init__(self):
    conf = SparkConf() \
        .set('spark.sql.shuffle.partitions', '50') \
        .set('spark.jars.packages',
             'ml.combust.mleap:mleap-spark-base_2.11:0.7.0,'
             'ml.combust.mleap:mleap-spark_2.11:0.7.0')
    sc = SparkContext(conf=conf)
    sc.setLogLevel('WARN')
    self.hc = HiveContext(sc)
from pyspark import SparkConf, SparkContext
from pyspark.sql import HiveContext

APP_NAME = "read-json"


def main(sc, sqlC):
    df = sqlC.read.json("./data.json")
    df.show()
    df.printSchema()
    df.select("age").show()
    # DataFrame transformation
    df.filter(df["age"] > 20).show()
    # Supported output formats: json, orc, parquet
    df.write.format("orc").saveAsTable("people", mode="overwrite")
    df.groupBy("age").count().show()


if __name__ == "__main__":
    conf = SparkConf().setAppName(APP_NAME)
    conf = conf.setMaster("local[*]")
    sc = SparkContext(conf=conf)
    sqlC = HiveContext(sc)
    main(sc, sqlC)
from __future__ import print_function

try:
    import findspark
    findspark.init()
    import pyspark
    sc = pyspark.SparkContext()
    sc.setLogLevel('WARN')
    print("spark context created")
    from pyspark import SparkConf, SparkContext
    from pyspark.sql import SQLContext, HiveContext
    sqlc = SQLContext(sc)
    sqlh = HiveContext(sc)
    sqlh.setConf("spark.sql.parquet.compression.codec", "gzip")
except Exception:
    # A SparkContext is already running in this process.
    print("spark context exists")
        maxDepth=10)

    preds = model.predict(lp_check.map(lambda x: x.features))

    labels_and_preds = lp_check.map(lambda x: x.label).zip(preds).sortBy(
        lambda x: x[1], ascending=False)
    for each in labels_and_preds.take(100):
        print each

    labels_and_preds = lp_check.map(lambda x: x.label).zip(preds).sortBy(
        lambda x: x[1], ascending=True)
    for each in labels_and_preds.take(100):
        print each

    # mean squared error of the predictions
    mse = labels_and_preds.map(
        lambda x: math.pow(x[0] - x[1], 2)).sum() / labels_and_preds.count()
    print mse

    # baseline: error of always predicting 1.0
    mse = labels_and_preds.map(
        lambda x: math.pow(x[0] - 1.0, 2)).sum() / labels_and_preds.count()
    print mse


if __name__ == "__main__":
    conf = SparkConf()
    conf.set("spark.executor.instances", "4")
    conf.set("spark.executor.cores", "4")
    conf.set("spark.executor.memory", "8g")
    sc = SparkContext(appName="bintrade.post.index", conf=conf)
    sql_context = HiveContext(sc)
    main(sc, sql_context, is_hive=True)
    sc.stop()
def rdd_to_spark_df_or_srdd(rdd, **kwargs):
    return append(HiveContext(rdd.context), rdd, **kwargs)
# Validate DateArg with a regex (expected format: YYYY-MM).
matchObj1 = re.match(r'[0-9]{4}[-][0][1-9]|[0-9]{4}[-][1][0-2]', DateArg)
if (len(DateArg) == 7):
    if (matchObj1):
        print("Hi, correct argument")
    else:
        print('Invalid input, please try again in (YYYY-MM) format. \nTerminating Program..........')
        # Terminate the program when the DateArg value is invalid.
        sys.exit(0)

# Initialize SparkContext and HiveContext.
conf = SparkConf().setAppName('Mini Bridge Table') \
    .set("spark.executor.memory", "64g") \
    .set("spark.driver.memory", "32g")
sc = SparkContext(conf=conf)
hc = HiveContext(sc)

windowdf = hc.sql("""SELECT DISTINCT rowid_cdh_household, rowid_cdh_party, last_update_date
FROM (
    SELECT rowid_cdh_household, rowid_cdh_party, last_update_date,
           rank() OVER (PARTITION BY rowid_cdh_party ORDER BY last_update_date DESC) AS rank
    FROM t_sda01.c_cdh_household_prty_rel
    WHERE last_update_date < '2012-10-09 10:10:01') AS tmp
from pyspark import SparkConf, SparkContext
from pyspark.sql import HiveContext
from pyspark.sql.types import DoubleType
from pyspark.sql.types import IntegerType
from pyspark.sql import functions as func
from pyspark.sql.functions import col

# Create the contexts (in the pyspark shell, sc already exists and this line
# would be unnecessary).
sc = SparkContext(conf=SparkConf())
hive_context = HiveContext(sc)

procedure = hive_context.table("default.procedure")
hospitals = hive_context.table("default.hospital")

procedure_typecast = procedure.withColumn(
    "score", procedure["score"].cast(DoubleType())).withColumn(
        "sample", procedure["sample"].cast(IntegerType())).withColumn(
            "denominator", procedure["denominator"].cast(IntegerType()))

procedure_hospital = procedure_typecast.join(
    hospitals, procedure_typecast.provider_id == hospitals.provider_id)

# subset for those procedures that have a score not higher than 100 and a sample of at least 50
score_avg = procedure_hospital.where((procedure_hospital['score'] <= 100) & (
    procedure_hospital['sample'] > 50)).groupby('state').agg(func.avg('score'))

# show the 10 best states
best_states = score_avg.sort(score_avg['avg(score)'].desc()).show(10)
MOUNT_NAME = "sparkfish"
dbutils.fs.mount("s3n://%s:%s@%s" % (ACCESS_KEY, ENCODED_SECRET_KEY, AWS_BUCKET_NAME),
                 "/mnt/%s" % MOUNT_NAME)
display(dbutils.fs.ls("/mnt/sparkfish"))

# COMMAND ----------

# MAGIC %python
# MAGIC from pyspark.sql import functions as F
# MAGIC from pyspark.sql.functions import datediff, to_date, lit, unix_timestamp, split
# MAGIC from pyspark.sql.types import *
# MAGIC
# MAGIC # Build DataFrame dataset to work with.
# MAGIC formatPackage = "csv" if sc.version > '1.6' else "com.databricks.spark.csv"
# MAGIC df = sqlContext.read.format(formatPackage).options(header='true', delimiter=',').load("dbfs:/mnt/sparkfish/titanic.csv")
# MAGIC data_df = df.withColumn("Age", df["Age"].cast(IntegerType()))
# MAGIC data_df.printSchema()
# MAGIC data_df.write.saveAsTable('sparkfishTable', format='parquet', mode='overwrite', path='dbfs:/mnt/sparkfish/sparkfishTable/')

# COMMAND ----------

from pyspark import SparkContext
from pyspark.sql import HiveContext
hiveContext = HiveContext(sc)
display(hiveContext.sql("SELECT percentile(Age, 0.75) FROM sparkfishTable"))

# COMMAND ----------

from pyspark import SparkContext
from pyspark.sql import HiveContext
hiveContext = HiveContext(sc)
display(hiveContext.sql("SELECT avg(Age) FROM sparkfishTable"))
def analyze_column(sc, X):
    """Analyze the columns of the matrix produced by MatrixCreator.

    Args:
        X: Spark DataFrame created by MatrixCreator

    Returns:
        colInfo (dict): {
            'version': 'test_version',
            'preprocess': {
                'all':       [col1, col2, ... , colN],
                'singleton': [col1, col2, ... , colN],
                'string':    [col1, col2, ... , colN],
                'final':     [col1, col2, ... , colN]
            }
        }
    """
    hc = HiveContext(sc)
    colInfo = {}
    preprocess = {}

    # 1. Create X matrix from matrixCreator
    # 2. Sample from this X matrix
    # 3. Transform the X matrix to a dictionary
    print 'sampling from matrix'
    if X.is_cached:
        print 'the matrix is cached'
    else:
        print 'the matrix is somehow not cached, WTF!!!'
    sampleX = (X.sample(withReplacement=False, fraction=0.005, seed=42)
                .map(lambda x: x.items))
    df = pd.DataFrame(sampleX.collect())
    # df = df.toPandas()
    print 'sampling done'

    preprocess['all'] = list(df.columns)
    ori_num = len(df.columns)

    # Drop columns that cannot be coerced to numeric values.
    dfNu = df.apply(pd.to_numeric, errors='coerce')
    dfNuRM = dfNu.dropna(axis=1, how='all')
    nu_num = len(dfNuRM.columns)
    diff = list(set(df.columns) - set(dfNuRM.columns))
    preprocess['string'] = diff

    # Drop columns that only ever take a single value.
    remove_count = 0
    colSet = set(dfNuRM.columns)
    for col in dfNuRM.columns:
        if len(dfNuRM[col].value_counts()) == 1:
            del dfNuRM[col]
            remove_count += 1
    print 'There are {} columns being removed'.format(remove_count)
    diff = list(colSet - set(dfNuRM.columns))
    preprocess['singleton'] = diff
    preprocess['final'] = list(dfNuRM.columns)
    colInfo['preprocess'] = preprocess

    # Sanity check: string + singleton + final should add back up to all.
    try:
        listLen = 0
        listList = []
        for key in colInfo['preprocess'].keys():
            if key != 'all':
                listLen += len(colInfo['preprocess'][key])
                listList += list(colInfo['preprocess'][key])
        if ((listLen == len(colInfo['preprocess']['all'])) &
                (set(list(colInfo['preprocess']['all'])) == set(listList))):
            print 'the number is correct'
    except:
        print 'the number is incorrect'

    return colInfo
from pyspark import SparkContext
from pyspark.sql import HiveContext

sc = SparkContext()
hospitals = HiveContext(sc).sql('from hospitals select *')
hospitals.show()
#import pandas as pd
import commands
import ast
import itertools
import pyspark.sql.functions
from pyspark.sql.functions import col
from pyspark.sql.functions import current_date
from datetime import datetime, timedelta
from collections import Counter
import re
#import numpy as np
from pyspark import SparkContext, SparkConf
from pyspark.sql import SQLContext, HiveContext

sc = SparkContext.getOrCreate()
sqlContext = HiveContext(sparkContext=sc)
sqlCtx = HiveContext(sparkContext=sc)


def freq(lst):
    # Count occurrences of each element in lst.
    d = {}
    for i in lst:
        if d.get(i):
            d[i] += 1
        else:
            d[i] = 1
    return d


def get_nested_keys(a):
    key_list = []
def hiveContext(sparkContext):
    return HiveContext(sparkContext)
#coding=UTF-8
from pyspark import SparkContext, SparkConf
from pyspark.sql import SQLContext, HiveContext, Row
from pyspark.sql.types import *
from datetime import date, datetime, timedelta
import sys, re, os

st = datetime.now()
conf = SparkConf().setAppName('PROC_F_CI_SUN_CREDIT_EXT').setMaster(sys.argv[2])
sc = SparkContext(conf=conf)
sc.setLogLevel('WARN')
if len(sys.argv) > 5:
    if sys.argv[5] == "hive":
        sqlContext = HiveContext(sc)
    else:
        sqlContext = SQLContext(sc)
hdfs = sys.argv[3]
dbname = sys.argv[4]

# Dates needed by this job
etl_date = sys.argv[1]  # ETL date
V_DT = etl_date
# previous day
V_DT_LD = (date(int(etl_date[0:4]), int(etl_date[4:6]), int(etl_date[6:8])) +
           timedelta(-1)).strftime("%Y%m%d")
# first day of the current month
V_DT_FMD = date(int(etl_date[0:4]), int(etl_date[4:6]), 1).strftime("%Y%m%d")
# last day of the previous month
V_DT_LMD = (date(int(etl_date[0:4]), int(etl_date[4:6]), 1) +
            timedelta(-1)).strftime("%Y%m%d")
def hive_context(spark_context):
    return HiveContext(spark_context)
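# The two one-line helpers above look like pytest fixtures (each takes a *_context
# fixture and wraps it in a HiveContext), although the @pytest.fixture decorators do
# not appear in these excerpts. The sketch below shows how such a fixture might be
# declared and used; the spark_context fixture and the test itself are assumptions,
# not part of the original test suites.
import pytest
from pyspark.sql import HiveContext


@pytest.fixture
def hive_ctx(spark_context):
    # Assumes a spark_context fixture is defined elsewhere in the test suite.
    return HiveContext(spark_context)


def test_simple_query(hive_ctx):
    df = hive_ctx.createDataFrame([(1, 'a'), (2, 'b')], ['id', 'name'])
    assert df.count() == 2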