def _upload_to_hive(self, logs):

        sql_context = HiveContext()

        # update Hive table
        df_writer = DataFrameWriter(logs)
        df_writer.insertInto(self.destination_table, overwrite=True)
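
For comparison, a minimal sketch of the same Hive insert through a SparkSession (assumptions: Hive support is enabled; staging_logs and warehouse.logs are placeholder names):

from pyspark.sql import SparkSession

spark = SparkSession.builder.enableHiveSupport().getOrCreate()
logs = spark.sql("SELECT * FROM staging_logs")   # placeholder source query
destination_table = "warehouse.logs"             # placeholder target table

# equivalent to DataFrameWriter(logs).insertInto(destination_table, overwrite=True)
logs.write.insertInto(destination_table, overwrite=True)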
Example #2
    def create_VP_tables(self):
        print "Beginning the creation of VP tables."
        total_properties = len(self.properties)
        i = 0
        # for each distinct property, create a table
        for p in self.properties:
            i += 1
            prop_df = self.sqlContext.sql(
                "SELECT s AS s, o AS o FROM tripletable WHERE p='" + p + "'")
            df_writer = DataFrameWriter(prop_df)
            df_writer.saveAsTable("VP_" + valid_string(p))
            sys.stdout.write("\rTables created: %d / %d " %
                             (i, total_properties))

        # if statistics are enabled, compute them
        if self.statsEnabled:
            i = 0
            stat = Stats()
            for p in self.properties:
                i += 1
                tableDF = self.sqlContext.sql("SELECT * FROM VP_" +
                                              valid_string(p))
                stat.addTableStat(p, tableDF)
                sys.stdout.write("\rStatistics created: %d / %d " %
                                 (i, total_properties))
            with open(self.statsFile, "w") as f:
                f.write(stat.getSerializedStats())
            print("Statistics created: %d / %d " % (i, total_properties))
Example #3
def write_to_db(df,
                table,
                mode='append',
                database='project_db',
                schema='public'):
    db_property = connect_to_db(database)
    #Write to database
    writer = DataFrameWriter(df)
    writer.jdbc(db_property['url'], table, mode, db_property)
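
connect_to_db is referenced but not shown; a guess at what it might return, based on how db_property is used above (an 'url' entry plus JDBC connection properties; every value is a placeholder):

def connect_to_db(database):
    # hypothetical sketch: one dict doubles as URL lookup and JDBC properties
    return {
        'url': 'jdbc:postgresql://localhost:5432/{}'.format(database),
        'user': 'postgres',          # placeholder
        'password': 'postgres',      # placeholder
        'driver': 'org.postgresql.Driver',
    }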
Example #4
 def write(self, df, table, md):
     """
     Args:
          df: pyspark.sql.DataFrame
          table: str
          md: str, JDBC save mode (e.g. 'append' or 'overwrite')
     :rtype: None
     """
     writer = DataFrameWriter(df)
     writer.jdbc(self.url_connect, table, md, self.properties)
Example #5
    def write_jdbc(self, df, table, mode):
        writer = DataFrameWriter(df)
        url = self.cf.get('mysql', 'url')
        properties = {
            'user': self.cf.get('mysql', 'user'),
            'password': self.cf.get('mysql', 'password'),
            'driver': self.cf.get('mysql', 'driver')
        }

        writer.jdbc(url, table, mode, properties)
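
self.cf is presumably a ConfigParser; a sketch of the [mysql] section such code would read (all values are placeholders):

import configparser

cf = configparser.ConfigParser()
cf.read_string("""
[mysql]
url = jdbc:mysql://localhost:3306/testdb
user = root
password = secret
driver = com.mysql.cj.jdbc.Driver
""")
# cf.get('mysql', 'url') -> 'jdbc:mysql://localhost:3306/testdb'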
Example #6
 def write(self, df, table_name, mode):
     """
     Args:
          df: pyspark.sql.DataFrame
          table_name: str
          mode: str, JDBC save mode (e.g. 'append' or 'overwrite')
     :rtype: None
     """
     writer = DataFrameWriter(df)
     writer.jdbc(self.url_connect, table_name, mode, self.properties)
Example #7
def main():
    sc = SparkContext()
    hc = HiveContext(sc)

    df = hc.sql("""{{sql}}""")
    df_writer = DataFrameWriter(df)
    df_writer.saveAsTable(name='{{tableName}}',
                          format='json',
                          mode='overwrite',
                          path='s3://data/{{tableName}}')
Example #8
def write_to_DB(df, table_name):
    url = 'jdbc:postgresql://{}/{}'.format(config['postgres_conn']['host'],
                                           config['postgres_conn']['database'])
    properties = {
        'user': config['postgres_conn']['user'],
        'password': config['postgres_conn']['password'],
        "driver": "org.postgresql.Driver"
    }
    #tablename = 'events'
    writer = DataFrameWriter(df)
    writer.jdbc(url=url,
                table=table_name,
                properties=properties,
                mode='append')
Example #9
def saveToHive(spark, out_rdd, out_table):
    schema = StructType([StructField("serial_number_pre", StringType(), True),
                         StructField("pcb_number_pre", StringType(), True),
                         StructField("serial_number_tmp", StringType(), True),
                         StructField("chain", StringType(), True)])
    df = spark.createDataFrame(out_rdd, schema)
    DataFrameWriter(df).insertInto(out_table)
Example #10
def write_to_postgres(df, table, mode):
    DataFrameWriter(df).jdbc(
        POSTGRES_URL, table, mode, {
            'user': POSTGRES_USER,
            'password': POSTGRES_PWD,
            'driver': 'org.postgresql.Driver'
        })
Example #11
def format_save(writer: DataFrameWriter, data_source: DataSource) -> None:
    if data_source.data_format is DataFormat.CSV:
        writer.option("header", True).csv(data_source.source_path)
    if data_source.data_format is DataFormat.PARQUET:
        writer.parquet(data_source.source_path)
    if data_source.data_format is DataFormat.JSON:
        writer.json(data_source.source_path)
    if data_source.data_format is DataFormat.JDBC:
        if type(data_source) is JDBCDataSource:
            ## TODO
            writer.jdbc()
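
The DataFormat, DataSource and JDBCDataSource types are not shown here; a hypothetical sketch of definitions this dispatcher could work against, plus an illustrative call:

from dataclasses import dataclass
from enum import Enum, auto

class DataFormat(Enum):
    CSV = auto()
    PARQUET = auto()
    JSON = auto()
    JDBC = auto()

@dataclass
class DataSource:
    data_format: DataFormat
    source_path: str

@dataclass
class JDBCDataSource(DataSource):
    url: str = ""      # placeholder JDBC connection details
    table: str = ""

# illustrative call: df.write is the DataFrameWriter of a DataFrame `df`
# format_save(df.write, DataSource(DataFormat.PARQUET, "/tmp/out"))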
Example #12
    def stat_act_time(self):
        """
        Count, for each card and each month, the number of transactions of each business type (time deposit, demand deposit, wealth management).
        :return:
        """
        target_table = 't_CMMS_TEMP_ACTTIME'
        save_mode = 'append'
        acc_detail_source = self.sc.textFile('%s/jjzhu/test/input/t_CMMS_ACCOUNT_DETAIL.txt' % self.hdfs_root_url)
        # this updates the t_CMMS_TEMP_ACTTIME table
        acc_list = self.load_from_mysql('core', 't_CMMS_ACCOUNT_LIST') \
            .map(lambda row: (row.asDict()['ACC_NO15'], (row.asDict()['CUST_NO'], row.asDict()['ACC_NAM']))).distinct()
        acc_list.cache()  # cache for reuse
        split_rdd = acc_detail_source.map(lambda line: line.split(','))
        split_rdd.cache()  # cache for reuse
        start_time = '2016-02-01'  # TODO: later, derive the earliest date from the data
        end_time = '2016-08-01'  # TODO: the latest date
        time_slots = self.get_time_slot(start_time, end_time)
        for slot in time_slots:
            self.logger.info('statistic action time of %s' % slot[0][0: slot[0].rindex('-')])
            # key by customer number + business type and count each customer's transactions per type
            act_time = split_rdd.filter(lambda columns: columns[1] <= slot[1]) \
                .filter(lambda columns: columns[1] >= slot[0]) \
                .map(lambda columns: (columns[3] + '_' + columns[6], 1)) \
                .reduceByKey(lambda a, b: a + b)

            mapped_act_time = act_time.map(lambda fields:
                                           (fields[0].split('_')[0],
                                            (fields[0].split('_')[0], fields[0].split('_')[1],
                                             fields[1], slot[0][0: slot[0].rindex('-')])))
            # join to attach the customer details for each customer number
            result = mapped_act_time.join(acc_list).map(lambda fields: fields[1][0] + fields[1][1])

            #  ACCT_NO15, BUST_TYPE, ACT_TIME, CURR_MONTH, CUST_NO, ACCT_NAM
            # '101006463122653', '1', 25, '2016-02-01', '81024082971', '曹镇颜'

            result_writer = DataFrameWriter(
                self.sql_context.createDataFrame(result, ['ACCT_NO15', 'BUST_TYPE', 'ACT_TIME',
                                                          'CURR_MONTH', 'CUST_NO', 'ACCT_NAM']))
            self.logger.info('save the statistic result into mysql\n'
                             'url: %s\n'
                             'target table: %s\n'
                             'mode: %s' % (self.mysql_url, target_table, save_mode))
            result_writer.jdbc(self.mysql_url, table=target_table, mode=save_mode)
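
get_time_slot is not shown; a plausible sketch, guessed from how slot[0] and slot[1] are used above (consecutive month windows as 'YYYY-MM-DD' strings):

import datetime

def get_time_slot(start_time, end_time):
    """Return consecutive month windows [(month_start, next_month_start), ...]."""
    fmt = '%Y-%m-%d'
    cur = datetime.datetime.strptime(start_time, fmt)
    end = datetime.datetime.strptime(end_time, fmt)
    slots = []
    while cur < end:
        # jump to the first day of the following month
        nxt = (cur.replace(day=1) + datetime.timedelta(days=32)).replace(day=1)
        slots.append((cur.strftime(fmt), nxt.strftime(fmt)))
        cur = nxt
    return slots

# get_time_slot('2016-02-01', '2016-04-01')
# -> [('2016-02-01', '2016-03-01'), ('2016-03-01', '2016-04-01')]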
Example #13
def write_dataframe_to_postgres2(df, table, mode):
    df = df.withColumn('listing_id', df.listing_id.cast('INT'))
    df = df.withColumn('id', df.id.cast('INT'))
    df = df.withColumn('date', Func.to_timestamp(df.date, format='yyyy-MM-dd'))  # Spark uses Java-style patterns: lowercase y/d
    df = df.withColumn('reviewer_id', df.reviewer_id.cast('INT'))
    df = df.withColumn('reviewer_name', df.reviewer_name.cast('STRING'))
    df = df.withColumn('comments', df.comments.cast('STRING'))
    #df = df.withColumn('tone', 0.0)

    DataFrameWriter(df).jdbc(POSTGRES_URL, table, mode, {
        'user': POSTGRES_USER,
        'password': POSTGRES_PWD,
        'driver': 'org.postgresql.Driver'
    })
Example #14
def write_dataframe_to_postgres(df, table, mode):
    df = df.withColumn('incident_datetime', Func.to_timestamp(df.incident_datetime, format='yyyy/MM/dd hh:mm:ss a'))  # Spark uses Java-style patterns, not Oracle-style HH24:MI
    df = df.withColumn('incident_date', Func.to_timestamp(df.incident_date, format='yyyy/MM/dd'))
    df = df.withColumn('incident_time', Func.to_timestamp(df.incident_time, format='HH:mm'))
    df = df.withColumn('incident_year', df.incident_year.cast('INT'))
    df = df.withColumn('incident_day_of_week', df.incident_day_of_week.cast('STRING'))
    df = df.withColumn('incident_id', df.incident_id.cast('INT'))
    df = df.withColumn('incident_category', df.incident_category.cast('STRING'))
    df = df.withColumn('incident_subcategory', df.incident_subcategory.cast('STRING'))
    df = df.withColumn('incident_description', df.incident_description.cast('STRING'))
    df = df.withColumn('resolution', df.resolution.cast('STRING'))
    df = df.withColumn('analysis_neighborhood', df.analysis_neighborhood.cast('STRING'))
    df = df.withColumn('latitude', df.latitude.cast('FLOAT'))
    df = df.withColumn('longitude', df.longitude.cast('FLOAT'))

    DataFrameWriter(df).jdbc(POSTGRES_URL, table, mode, {
        'user': POSTGRES_USER,
        'password': POSTGRES_PWD,
        'driver': 'org.postgresql.Driver'
    })
Example #15
def save_df(df, day_of_week, passenger_count):
    storage_dir = '/hadoop/cms/store/user/jgran/taxi/saved_dataframes/'
    name = 'df_'+day_of_week+'_'+passenger_count
    #df_writer = DataFrameWriter(df.coalesce(50))
    df_writer = DataFrameWriter(df)
    df_writer.parquet(storage_dir+name)
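
A quick read-back check for the Parquet output above (assumes an active SparkSession named spark; 'Monday' and '1' stand in for day_of_week and passenger_count):

from pyspark.sql import SparkSession

spark = SparkSession.builder.getOrCreate()
# path mirrors save_df: storage_dir + 'df_' + day_of_week + '_' + passenger_count
df_back = spark.read.parquet('/hadoop/cms/store/user/jgran/taxi/saved_dataframes/df_Monday_1')
df_back.printSchema()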
Example #16
 def get_writer(self, df):
     return DataFrameWriter(df)
Example #17
from pyspark.sql import DataFrameWriter


def write_df_to_table(df_writer, table):
    jdbc_url = "jdbc:postgresql://0.0.0.0:5432/postgres"
    mode = "overwrite"
    properties = {
        "user": "******",
        "password": "******",
        "driver": "org.postgresql.Driver"
    }

    print("Writing to {}".format(table))

    df_writer.jdbc(jdbc_url, table, mode, properties)


# Write df2
write_df_to_table(DataFrameWriter(df2), "stock_data")

# Write monthly average close
write_df_to_table(DataFrameWriter(monthly_avg_close_df), "avg_month_close")

# Write adj close diff than close count
write_df_to_table(DataFrameWriter(adj_close_diff_than_close),
                  "adjusted_close_count")

print("DB Write complete")

print("Complete")
Example #18
def saveToHive(hiveContext, rddData, out_table):
    hiveContext.sql("truncate table " + out_table)
    df = hiveContext.createDataFrame(
        rddData, ["defectid", "test_tools", "loaded_timestamp"])
    DataFrameWriter(df).insertInto(out_table)
    logging.info("Test tool predict finished successed.")
Example #19
from pyspark import SparkConf, SparkContext
from pyspark.sql import SQLContext

# spark context
sc = SparkContext()
#sc._jsc.hadoopConfiguration().set("fs.s3a.awsAccessKeyId", "XXXX")
#sc._jsc.hadoopConfiguration().set("fs.s3a.awsSecretAccessKey", "XXX")

# SparkSQL context
sqlContext = SQLContext(sc)

filename = "RC_2006-01.parquet"
destination="s3a://reddit-comments-raw/2006/comments_2006_05.parquet"
data = sqlContext.read.parquet(destination)
print("done",data.count())

from pyspark.sql import DataFrameWriter
my_writer = DataFrameWriter(data)


md="overwrite"
database_name = 'reddit'
hostname = 'ec2-54-214-117-182.us-west-2.compute.amazonaws.com'
url_connect = "jdbc:postgresql://{hostname}:5000/{db}".format(hostname=hostname, db=database_name)
properties = {"user":"******","password" : "webuiuser","driver": "org.postgresql.Driver"}
table="word_count"

my_writer.jdbc(url=url_connect, table=table, mode=md, properties=properties)


Example #20
def save_df(df, day_of_week, passenger_count):
    storage_dir = '/hadoop/cms/store/user/jgran/taxi/saved_dataframes/'
    name = 'df_' + day_of_week + '_' + passenger_count
    #df_writer = DataFrameWriter(df.coalesce(50))
    df_writer = DataFrameWriter(df)
    df_writer.parquet(storage_dir + name)
        phone = ob.get("bankpremobile", None)
    else:
        phone = None
    return (phone, idCard, idBank, name)


# xiaoshudian_app_key = "1186159692"
# xiaoshudian_tec_api = ["tel","address_getbymobile","channel_NameIDCardAccountVerify","channel_cellphone","operator_capricorn","address_match","channel_idcard","channel_bankby3","channel_idNameFase","channel_criminal","channel_blacklistverify","credit_implement"]
# def filter(app_key,api):
#     not (app_key in xiaoshudian_app_key and api not in  xiaoshudian_tec_api)


data_rdd = data.rdd.map(lambda a: (a.app_key_param, a.date, standard_params(a.params), a.interface, a.api_type)) \
    .map(lambda t: (t[0], t[1], t[2][0], t[2][1], t[2][2], t[2][3], t[3], t[4]))
'''
t[2] is the (phone, idcard, idbank, name) tuple returned by standard_params:
t[2][0] -> phone
t[2][1] -> idcard
t[2][2] -> idbank
t[2][3] -> name
'''
schemaStr = "app_key date phone idcard idbank name interface api_type"
fields = [
    StructField(field_name, StringType(), True)
    for field_name in schemaStr.split()
]
schema = StructType(fields)

data_df = hc.createDataFrame(data_rdd, schema).distinct()
dfw = DataFrameWriter(data_df)
dfw.saveAsTable("wl_analysis.t_lel_record_data_backflow", mode="overwrite")
def saveToHive(hiveContext, rddData, out_table):
    hiveContext.sql("truncate table " + out_table)
    df = hiveContext.createDataFrame(rddData, ["defectid", "categories", "loaded_timestamp"])
    DataFrameWriter(df).insertInto(out_table)
    lambda a, b: a + b).map(lambda x: tfin(x))

# In[27]:

#rdd2.take(10)

# In[23]:

tdf2 = hiveContext.createDataFrame(rdd2, ['uid', 'loc_ts_dur', 'data_dt'])

# In[24]:

#tdf2.collect()

# In[25]:

df_writer = DataFrameWriter(tdf2)
df_writer.insertInto(oHiveTable, overwrite=True)

# In[26]:

sc.stop()

# In[31]:

#toLocDurTuples('101,2016-06-01@12:04:02,[40:50][202:203],20160601')

# In[32]:

#tfin(('101_20160601', [(202, 1464782642, 40), (203, 1464782682, 50)]))
Example #24
            indices_to_terms(model.vocabulary)("termIndices"))
        #ti.persist(StorageLevel.MEMORY_AND_DISK)

        ti = ti.withColumn('topicString', parseInts(ti["topic"]))
        ti = ti.withColumn('termWeightsString', parseThetas(ti["termWeights"]))
        ti = ti.withColumn('topics_wordsString',
                           parseTerms(ti["topics_words"]))

        #print(ti.columns)

        #save file - terms per topic
        #f = open(savePath + 'terms.csv','w')

        #save file - topic distribution and terms

        writer = DataFrameWriter(df2.select('id', 'thetaString'))
        writer.csv('file://' + savePath + thisSet + '.thetas')

        writer = DataFrameWriter(
            ti.select('topicString', 'termWeightsString',
                      'topics_wordsString'))
        writer.csv('file://' + savePath + thisSet + '.terms')
        """
		#save file - terms per topic
		f = open(savePath + thisSet + '.terms.csv','w')

		for row in ti2:
			items = len(row[3])
			for i in range(items):
				f.write(str(row[0]) + ',' + row[3][i].encode('utf-8') + ',' + str(row[2][i]) + '\n')
Example #25
def _aliased_writer(df_writer: DataFrameWriter, format_key: str,
                    path: Optional[str], **options: str):
    """ Saves the dataframe to a file of the given type at the given path."""
    return df_writer.format(format_key).save(path, **options)
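
An illustrative call for the helper above (the toy DataFrame, format key, target path and option are assumptions):

from pyspark.sql import SparkSession

spark = SparkSession.builder.getOrCreate()
df = spark.range(10)   # toy DataFrame
_aliased_writer(df.write, "parquet", "/tmp/aliased_writer_demo", compression="snappy")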
Example #26
 def save_df(self, path, df):
     self.delete_from_hdfs(path)
     writer = DataFrameWriter(df)
     writer.save(path)
Example #27
    if ob.has_key("mobile"):
        phone = ob.get("mobile", None)
    elif ob.has_key("phone"):
        phone = ob.get("phone", None)
    elif ob.has_key("enc_m"):
        phone = ob.get("enc_m", None)
    elif ob.has_key("ownerMobile"):
        phone = ob.get("ownerMobile", None)
    elif ob.has_key("bankpremobile"):
        phone = ob.get("bankpremobile", None)
    else:
        phone = None
    return (phone, idCard, idBank, name)


# xiaoshudian_app_key = "1186159692"
# xiaoshudian_tec_api = ["tel","address_getbymobile","channel_NameIDCardAccountVerify","channel_cellphone","operator_capricorn","address_match","channel_idcard","channel_bankby3","channel_idNameFase","channel_criminal","channel_blacklistverify","credit_implement"]
# def filter(app_key,api):
#     not (app_key in xiaoshudian_app_key and api not in  xiaoshudian_tec_api)
data_rdd = data.rdd.map(lambda a: (a.app_key_param, a.date, standard_params(a.params_less), a.interface, a.api_type)) \
    .map(lambda t: (t[0], t[1], t[2][0], t[2][1], t[2][2], t[2][3], t[3], t[4]))
schemaStr = "app_key date phone idcard idbank name interface api_type"
fields = [
    StructField(field_name, StringType(), True)
    for field_name in schemaStr.split()
]
schema = StructType(fields)
data_df = sqlContext.createDataFrame(data_rdd, schema).distinct()
dfw = DataFrameWriter(data_df)
dfw.saveAsTable("wl_analysis.t_lel_datamart_backflow_filtered",
                mode="overwrite")
Example #28
    def write(self, df, table, mode='append'):

        writer = DataFrameWriter(df)
        writer.jdbc(self.url_connect, table, mode, self.properties)
Example #29
    def init_balance(self, ctime, tp):
        """
        处理流程:
            方法接收 格式为'%Y-%m-%d'的ctime和业务类型tp
            while 当前月:
                调用存储过程 Calc_balance(该存储过程更新当前日期的余额)
                将当前余额保存到hdfs中
                天数+1
            将当前月每一天的余额复制到本地文件中进行合并xxxx_xx.data
            将xxxx_xx.data合并文件移到hdfs中去
            读取该文件,进行日均余额的计算,并保存结果到hdfs中
            将结果合并至一个文件xxxx_xx-r.data
            加载结果合并文件xxxx_xx-r.data,导入mysql数据库

        :param ctime:格式为'%Y-%m-%d'的ctime
        :param tp: 业务类型
        :return:
        """

        try:

            cdate = datetime.datetime.strptime(ctime, '%Y-%m-%d')
            end_date = cdate
            # the merged file is named <year>_<month>.data
            merged_file_name = '%s.data' % cdate.strftime('%Y_%m')

            result_file_name = '%s-r.data' % cdate.strftime('%Y_%m')
            # HDFS input path
            hdfs_root_path = '%s/jjzhu/test/_%s/%s/' % (self.hdfs_root_url, str(tp.value), cdate.strftime('%Y_%m'))
            # local temporary directory path
            local_temp_path = '%s/temp/_%s/' % (self.local_root_url, str(tp.value))

            def exist_or_create(path):
                if not os.path.exists(path):
                    self.logger.warning('local path: %s does not exist' % path)
                    self.logger.info('creating dir: %s' % path)
                    os.makedirs(path)
                else:
                    self.logger.info('local path: %s already exists' % path)

            exist_or_create(local_temp_path)

            # output path for the locally merged file
            local_output_path = '%s/output/_%s/' % (self.local_root_url, str(tp.value))
            exist_or_create(local_output_path)

            local_result_path = '%s/result/_%s/' % (self.local_root_url, str(tp.value))
            exist_or_create(local_result_path)
            # HDFS output path for the AUM calculation result
            hdfs_save_path = '%s/jjzhu/test/result/_%s/%s' % (self.hdfs_root_url,
                                                              str(tp.value), cdate.strftime('%Y_%m'))
            # HDFS input path of the source file for the AUM calculation
            hdfs_input_path = '%s/jjzhu/test/input/_%s/' % (self.hdfs_root_url, str(tp.value))
            if os.system('hadoop fs -mkdir %s' % hdfs_input_path) == 0:
                self.logger.warning('%s did not exist; created it successfully' % hdfs_input_path)
            target_save_table = 't_CMMS_ASSLIB_ASSET_c'  # TODO
            save_mode = 'append'
            local_output_file_path = local_output_path + merged_file_name
            local_dir_for_merge = local_temp_path + cdate.strftime('%Y_%m')

            # convert a Row() into a tuple
            def change_row(row):
                row_dict = row.asDict()
                return (row_dict['CUST_NO'], row_dict['ACCT_NO15'], str(row_dict['CURR_BAL']),
                        str(row_dict['TIME']), row_dict['CUR'],
                        row_dict['CURR_DATE'].strftime('%Y-%m-%d'),
                        str(row_dict['BUSI_TYPE']))

            def calc_avg(el):
                r = el[1]
                return (r[0], r[1], float(r[2]) / int(r[3]), r[4],
                        datetime.datetime.strptime(str(r[5]), '%Y-%m-%d').strftime('%Y-%m'), r[6])

            # copy each month's data from HDFS to the local filesystem
            def copy_from_hdfs_to_local(hdfs_dir, local_dir):
                import sys
                if not os.path.exists(local_dir):
                    self.logger.warning('%s does not exist, creating it first.' % local_dir)
                    os.system('mkdir ' + local_dir)
                    # exit(-1)
                shell_command = 'hadoop fs -copyToLocal %s %s' % (hdfs_dir, local_dir)
                self.logger.info('execute hdfs shell command: %s' % shell_command)
                os.system(shell_command)

            # merge files
            def merge_file(input_dir, output_path):
                import platform
                if '/data' in output_path:
                    output_file = open(output_path, 'w+')
                    if platform.system() == 'Windows':
                        deli = '\\'
                    elif platform.system() == 'Linux':
                        deli = '/'
                    else:
                        self.logger.error('unknown platform: %s' % platform.system())
                    for i in os.listdir(input_dir):
                        curr_dir = input_dir + deli + i
                        for j in os.listdir(curr_dir):
                            if j.startswith('part'):  # only merge output files whose names start with 'part'
                                with open(curr_dir + deli + j) as f:
                                    # print(curr_dir+deli+j)
                                    for line in f.readlines():
                                        # print(line[1:len(line)-2])
                                        output_file.write(line[1:len(line) - 2] + '\n')

                    output_file.close()
                else:
                    self.logger.error('please make sure the output path is under /data')
                    exit(-1)

            while end_date.month == cdate.month:
                # print(end_date)
                hdfs_path = hdfs_root_path + end_date.strftime('%Y_%m_%d')
                # call the MySQL stored procedure to update the current balance
                self.logger.info('call Calc_balance(%s, %s)' %
                                 (end_date.strftime('%Y-%m-%d'), str(tp.value)))
                self.mysqlconn.execute_single('call Calc_balance(%s, %s)',
                                              (end_date.strftime('%Y-%m-%d'), str(tp.value)))
                # fetch the current balance
                curr_bal_df = self.load_from_mysql('core', 't_CMMS_TEMP_DAYBAL_T').select(
                    'CUST_NO', 'ACCT_NO15', 'CURR_BAL', 'TIME', 'CUR', 'CURR_DATE', 'BUSI_TYPE'
                )

                curr_bal_rdd = curr_bal_df.filter(curr_bal_df.BUSI_TYPE == tp.value).map(change_row)
                print(curr_bal_rdd.take(1))
                if curr_bal_rdd.count() == 0:
                    self.logger.warning('rdd is empty')
                    end_date += datetime.timedelta(1)
                    continue
                if os.system('hadoop fs -test -e ' + hdfs_path) == 0:
                    self.logger.warning('%s already exists, deleting it' % hdfs_path)
                    if os.system('hadoop fs -rm -r ' + hdfs_path) == 0:
                        self.logger.info('delete %s successful' % hdfs_path)
                # save the current balance to HDFS for later merging
                self.logger.info('save rdd content to %s' % hdfs_path)
                curr_bal_rdd.saveAsTextFile(hdfs_path)
                end_date += datetime.timedelta(1)
            # copy the current month's files to local and merge them
            self.logger.info('copy file from %s to %s' % (hdfs_root_path, local_temp_path))
            copy_from_hdfs_to_local(hdfs_root_path, local_temp_path)
            self.logger.info('merge file content\n\tinput dir: %s\n\toutput_file: %s' %
                             (local_dir_for_merge, local_output_file_path))
            merge_file(local_dir_for_merge, local_output_file_path)
            # move the merged file back to HDFS as the input for the AUM calculation
            if os.system('hadoop fs -test -e ' + hdfs_input_path) == 1:
                self.logger.warning('hdfs dir: %s does not exist' % hdfs_input_path)
                self.logger.info('\texecute hdfs command: hadoop fs -mkdir %s' % hdfs_input_path)
                os.system('hadoop fs -mkdir ' + hdfs_input_path)
            if os.system('hadoop fs -test -e ' + hdfs_input_path + merged_file_name) == 0:
                self.logger.info('hdfs file: %s already exists' % (hdfs_input_path + merged_file_name))
                self.logger.info('\texecute hdfs command: hadoop fs -rm ' + hdfs_input_path + merged_file_name)
                os.system('hadoop fs -rm ' + hdfs_input_path + merged_file_name)
            os.system('hadoop fs -put ' + local_output_file_path + ' ' + hdfs_input_path)

            # calculate AUM
            self.logger.info('start calculate AUM of %s' % cdate.strftime('%Y-%m'))
            all_data = self.sc.textFile(hdfs_input_path + merged_file_name)

            day_bal = all_data.map(lambda line: line.strip().split(',')) \
                .map(lambda fields: (fields[0].strip()[1:len(fields[0].strip()) - 1],  # CUST_NO
                                     fields[1].strip()[1:len(fields[1].strip()) - 1],  # ACCT_NO15
                                     float(fields[2].strip()[1:len(fields[2].strip()) - 1]),  # CURR_BAL
                                     1,  # TIME
                                     fields[4].strip()[1:len(fields[4].strip()) - 1],  # CUR
                                     fields[5].strip()[1:len(fields[5].strip()) - 1],  # CURR_DATE
                                     fields[6].strip()[1:len(fields[6].strip()) - 1],  # BUSI_TYPE
                                     ))
            # key by ACCT_NO15, accumulate balance and count, then compute the average daily balance
            add_bal = day_bal.map(lambda fields: (fields[1], fields)) \
                .reduceByKey(lambda a, b: (
            a[0], a[1], float(float(a[2]) + float(b[2])), int(a[3]) + int(b[3]), a[4], max(a[5], b[5]), a[6])) \
                .map(calc_avg)
            # if the save directory already exists, delete it; otherwise saveAsTextFile would fail
            if os.system('hadoop fs -test -e ' + hdfs_save_path) == 0:
                os.system('hadoop fs -rm -r ' + hdfs_save_path)
            # add_bal.cache()
            add_bal.saveAsTextFile(hdfs_save_path)

            os.system('hadoop fs -getmerge %s %s%s' % (hdfs_save_path, local_result_path, result_file_name))
            os.system('hadoop fs -put %s%s %s' % (local_result_path, result_file_name,
                                                  hdfs_input_path))

            result_rdd = self.sc.textFile(hdfs_input_path + result_file_name)
            mapped_rdd = result_rdd.map(lambda line: line[1: len(line) - 1]) \
                .map(lambda line: line.split(',')) \
                .map(lambda fields: (fields[0].strip()[1:len(fields[0].strip()) - 1],
                                     fields[1].strip()[1:len(fields[1].strip()) - 1],  # ACCT_NO15
                                     float(fields[2].strip()[0:len(fields[2].strip())]),  # AUM
                                     fields[3].strip()[1:len(fields[3].strip()) - 1],  # CUR
                                     fields[4].strip()[1:len(fields[4].strip()) - 1],  # STAT_DAT
                                     fields[5].strip()[1:len(fields[5].strip()) - 1],  # ASS_TYPE
                                     ))
            # rebuild the corresponding DataFrame and create its writer
            writer = DataFrameWriter(self.sql_context.createDataFrame(mapped_rdd, ['CUST_NO', 'ACC_NO15', 'AUM', 'CUR',
                                                                                   'STAT_DAT', 'ASS_TYPE']))
            # self.mysql_url = 'jdbc:mysql://10.9.29.212:3306/core?user=root&characterEncoding=UTF-8'
            # save the DataFrame's contents into the target table with the given mode
            writer.jdbc(self.mysql_url, table=target_save_table, mode=save_mode)
        except ValueError as e:
            self.logger.error(e)
            exit(-1)