def _upload_to_hive(self, logs):
    # NOTE: HiveContext normally takes a SparkContext, e.g. HiveContext(sc)
    sql_context = HiveContext()
    # update Hive table
    df_writer = DataFrameWriter(logs)
    df_writer.insertInto(self.destination_table, overwrite=True)
def create_VP_tables(self):
    print "Beginning the creation of VP tables."
    total_properties = len(self.properties)
    i = 0
    # for each distinct property, create a table
    for p in self.properties:
        i += 1
        prop_df = self.sqlContext.sql(
            "SELECT s AS s, o AS o FROM tripletable WHERE p='" + p + "'")
        df_writer = DataFrameWriter(prop_df)
        df_writer.saveAsTable("VP_" + valid_string(p))
        sys.stdout.write("\rTables created: %d / %d " % (i, total_properties))
    # if statistics are enabled, compute them
    if self.statsEnabled:
        i = 0
        stat = Stats()
        for p in self.properties:
            i += 1
            tableDF = self.sqlContext.sql("SELECT * FROM VP_" + valid_string(p))
            stat.addTableStat(p, tableDF)
            sys.stdout.write("\rStatistics created: %d / %d " % (i, total_properties))
        with open(self.statsFile, "w") as f:
            f.write(stat.getSerializedStats())
        print "Statistics created: %d / %d " % (i, total_properties)
def write_to_db(df, table, mode='append', database='project_db', schema='public'):
    db_property = connect_to_db(database)
    # Write to database
    writer = DataFrameWriter(df)
    writer.jdbc(db_property['url'], table, mode, db_property)
def write(self, df, table, md):
    """
    Args:
        df: pyspark.sql.DataFrame
        table: str
        md: str
    :rtype: None
    """
    writer = DataFrameWriter(df)
    writer.jdbc(self.url_connect, table, md, self.properties)
def write_jdbc(self, df, table, mode):
    writer = DataFrameWriter(df)
    url = self.cf.get('mysql', 'url')
    properties = {
        'user': self.cf.get('mysql', 'user'),
        'password': self.cf.get('mysql', 'password'),
        'driver': self.cf.get('mysql', 'driver')
    }
    writer.jdbc(url, table, mode, properties)
def write(self, df, table_name, mode):
    """
    Args:
        df: spark.dataframe
        table_name: str
        mode: str
    :rtype: None
    """
    writer = DataFrameWriter(df)
    writer.jdbc(self.url_connect, table_name, mode, self.properties)
def main():
    sc = SparkContext()
    hc = HiveContext(sc)
    df = hc.sql("""{{sql}}""")
    df_writer = DataFrameWriter(df)
    df_writer.saveAsTable(name='{{tableName}}', format='json', mode='overwrite',
                          path='s3://data/{{tableName}}')
def write_to_DB(df, table_name):
    url = 'jdbc:postgresql://{}/{}'.format(config['postgres_conn']['host'],
                                           config['postgres_conn']['database'])
    properties = {
        'user': config['postgres_conn']['user'],
        'password': config['postgres_conn']['password'],
        "driver": "org.postgresql.Driver"
    }
    # tablename = 'events'
    writer = DataFrameWriter(df)
    writer.jdbc(url=url, table=table_name, properties=properties, mode='append')
def saveToHive(spark, out_rdd, out_table):
    schema = StructType([StructField("serial_number_pre", StringType(), True),
                         StructField("pcb_number_pre", StringType(), True),
                         StructField("serial_number_tmp", StringType(), True),
                         StructField("chain", StringType(), True)])
    df = spark.createDataFrame(out_rdd, schema)
    DataFrameWriter(df).insertInto(out_table)
def write_to_postgres(df, table, mode):
    DataFrameWriter(df).jdbc(
        POSTGRES_URL, table, mode, {
            'user': POSTGRES_USER,
            'password': POSTGRES_PWD,
            'driver': 'org.postgresql.Driver'
        })
def format_save(writer: DataFrameWriter, data_source: DataSource) -> None:
    if data_source.data_format is DataFormat.CSV:
        writer.option("header", True).csv(data_source.source_path)
    if data_source.data_format is DataFormat.PARQUET:
        writer.parquet(data_source.source_path)
    if data_source.data_format is DataFormat.JSON:
        writer.json(data_source.source_path)
    if data_source.data_format is DataFormat.JDBC:
        if type(data_source) is JDBCDataSource:
            # TODO
            writer.jdbc()
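The JDBC branch above is left as a TODO in the original snippet. Below is a minimal sketch of one way it could be filled in; every attribute name on data_source (url, table, save_mode, properties) is an assumption for illustration, not part of the original JDBCDataSource API.

from pyspark.sql import DataFrameWriter

# Hypothetical completion of the JDBC branch; attribute names are assumptions.
def format_save_jdbc(writer: DataFrameWriter, data_source: "JDBCDataSource") -> None:
    # DataFrameWriter.jdbc(url, table, mode=None, properties=None) is the standard PySpark call.
    writer.jdbc(data_source.url,
                data_source.table,
                mode=data_source.save_mode,
                properties=data_source.properties)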
def stat_act_time(self):
    """
    Count, for each card and each month, the number of transactions per business
    type (time deposit, demand deposit, wealth management).
    :return:
    """
    target_table = 't_CMMS_TEMP_ACTTIME'
    save_mode = 'append'
    acc_detail_source = self.sc.textFile(
        '%s/jjzhu/test/input/t_CMMS_ACCOUNT_DETAIL.txt' % self.hdfs_root_url)
    # This updates the t_CMMS_TEMP_ACTTIME table
    acc_list = self.load_from_mysql('core', 't_CMMS_ACCOUNT_LIST') \
        .map(lambda row: (row.asDict()['ACC_NO15'],
                          (row.asDict()['CUST_NO'], row.asDict()['ACC_NAM']))).distinct()
    acc_list.cache()  # cache the account list
    split_rdd = acc_detail_source.map(lambda line: line.split(','))
    split_rdd.cache()  # cache the split records
    start_time = '2016-02-01'  # TODO should later become the earliest date
    end_time = '2016-08-01'  # TODO the latest date
    time_slots = self.get_time_slot(start_time, end_time)
    for slot in time_slots:
        self.logger.info('statistic action time of %s' % slot[0][0: slot[0].rindex('-')])
        # Key by customer number + business type and count each customer's transactions per type
        act_time = split_rdd.filter(lambda columns: columns[1] <= slot[1]) \
            .filter(lambda columns: columns[1] >= slot[0]) \
            .map(lambda columns: (columns[3] + '_' + columns[6], 1)) \
            .reduceByKey(lambda a, b: a + b)
        mapped_act_time = act_time.map(
            lambda fields: (fields[0].split('_')[0],
                            (fields[0].split('_')[0], fields[0].split('_')[1],
                             fields[1], slot[0][0: slot[0].rindex('-')])))
        # Join to attach the detailed customer info for each account number
        result = mapped_act_time.join(acc_list).map(lambda fields: fields[1][0] + fields[1][1])
        # ACCT_NO15, BUST_TYPE, ACT_TIME, CURR_MONTH, CUST_NO, ACCT_NAM
        # '101006463122653', '1', 25, '2016-02-01', '81024082971', '曹镇颜'
        result_writer = DataFrameWriter(
            self.sql_context.createDataFrame(
                result, ['ACCT_NO15', 'BUST_TYPE', 'ACT_TIME', 'CURR_MONTH', 'CUST_NO', 'ACCT_NAM']))
        self.logger.info('save the statistic result into mysql\n'
                         'url: %s\n'
                         'target table: %s\n'
                         'mode: %s' % (self.mysql_url, target_table, save_mode))
        result_writer.jdbc(self.mysql_url, table=target_table, mode=save_mode)
def write_dataframe_to_postgres2(df, table, mode):
    df = df.withColumn('listing_id', df.listing_id.cast('INT'))
    df = df.withColumn('id', df.id.cast('INT'))
    df = df.withColumn('date', Func.to_timestamp(df.date, format='YYYY-MM-DD'))
    df = df.withColumn('reviewer_id', df.reviewer_id.cast('INT'))
    df = df.withColumn('reviewer_name', df.reviewer_name.cast('STRING'))
    df = df.withColumn('comments', df.comments.cast('STRING'))
    # df = df.withColumn('tone', 0.0)
    DataFrameWriter(df).jdbc(POSTGRES_URL, table, mode, {
        'user': POSTGRES_USER,
        'password': POSTGRES_PWD,
        'driver': 'org.postgresql.Driver'
    })
def write_dataframe_to_postgres(df, table, mode):
    df = df.withColumn('incident_datetime',
                       Func.to_timestamp(df.incident_datetime, format='YYYY/MM/DD HH24:MI:SS PM'))
    df = df.withColumn('incident_date', Func.to_timestamp(df.incident_date, format='YYYY/MM/DD'))
    df = df.withColumn('incident_time', Func.to_timestamp(df.incident_time, format='HH24:MI'))
    df = df.withColumn('incident_year', df.incident_year.cast('INT'))
    df = df.withColumn('incident_day_of_week', df.incident_day_of_week.cast('STRING'))
    df = df.withColumn('incident_id', df.incident_id.cast('INT'))
    df = df.withColumn('incident_category', df.incident_category.cast('STRING'))
    df = df.withColumn('incident_subcategory', df.incident_subcategory.cast('STRING'))
    df = df.withColumn('incident_description', df.incident_description.cast('STRING'))
    df = df.withColumn('resolution', df.resolution.cast('STRING'))
    df = df.withColumn('analysis_neighborhood', df.analysis_neighborhood.cast('STRING'))
    df = df.withColumn('latitude', df.latitude.cast('FLOAT'))
    df = df.withColumn('longitude', df.longitude.cast('FLOAT'))
    DataFrameWriter(df).jdbc(POSTGRES_URL, table, mode, {
        'user': POSTGRES_USER,
        'password': POSTGRES_PWD,
        'driver': 'org.postgresql.Driver'
    })
def save_df(df, day_of_week, passenger_count):
    storage_dir = '/hadoop/cms/store/user/jgran/taxi/saved_dataframes/'
    name = 'df_' + day_of_week + '_' + passenger_count
    # df_writer = DataFrameWriter(df.coalesce(50))
    df_writer = DataFrameWriter(df)
    df_writer.parquet(storage_dir + name)
def get_writer(self, df):
    return DataFrameWriter(df)
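A minimal usage sketch for the factory above. The instance name repo, the SparkSession, the example DataFrame, and the output path are all assumptions for illustration; only get_writer comes from the original.

from pyspark.sql import SparkSession

# Assumes `repo` is an instance of the class that defines get_writer above.
spark = SparkSession.builder.getOrCreate()
df = spark.createDataFrame([(1, "a"), (2, "b")], ["id", "value"])
writer = repo.get_writer(df)            # returns a DataFrameWriter wrapping df
writer.mode("overwrite").parquet("/tmp/example_output")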
from pyspark.sql import DataFrameWriter


def write_df_to_table(df_writer, table):
    jdbc_url = "jdbc:postgresql://0.0.0.0:5432/postgres"
    mode = "overwrite"
    properties = {
        "user": "******",
        "password": "******",
        "driver": "org.postgresql.Driver"
    }
    print("Writing to {}".format(table))
    df_writer.jdbc(jdbc_url, table, mode, properties)


# Write df2
write_df_to_table(DataFrameWriter(df2), "stock_data")
# Write monthly average close
write_df_to_table(DataFrameWriter(monthly_avg_close_df), "avg_month_close")
# Write adj close diff than close count
write_df_to_table(DataFrameWriter(adj_close_diff_than_close), "adjusted_close_count")
print("DB Write complete")
print("Complete")
def saveToHive(hiveContext, rddData, out_table):
    hiveContext.sql("truncate table " + out_table)
    df = hiveContext.createDataFrame(
        rddData, ["defectid", "test_tools", "loaded_timestamp"])
    DataFrameWriter(df).insertInto(out_table)
    logging.info("Test tool predict finished successfully.")
from pyspark import SparkConf, SparkContext
from pyspark.sql import SQLContext

# spark context
sc = SparkContext()
# sc._jsc.hadoopConfiguration().set("fs.s3a.awsAccessKeyId", "XXXX")
# sc._jsc.hadoopConfiguration().set("fs.s3a.awsSecretAccessKey", "XXX")

# SparkSQL context
sqlContext = SQLContext(sc)

filename = "RC_2006-01.parquet"
destination = "s3a://reddit-comments-raw/2006/comments_2006_05.parquet"

data = sqlContext.read.parquet(destination)
print("done", data.count())

from pyspark.sql import DataFrameWriter

my_writer = DataFrameWriter(data)
md = "overwrite"
database_name = 'reddit'
hostname = 'ec2-54-214-117-182.us-west-2.compute.amazonaws.com'
url_connect = "jdbc:postgresql://{hostname}:5000/{db}".format(hostname=hostname, db=database_name)
properties = {"user": "******", "password": "webuiuser", "driver": "org.postgresql.Driver"}
table = "word_count"
my_writer.jdbc(url=url_connect, table=table, mode=md, properties=properties)
phone = ob.get("bankpremobile", None) else: phone = None return (phone, idCard, idBank, name) # xiaoshudian_app_key = "1186159692" # xiaoshudian_tec_api = ["tel","address_getbymobile","channel_NameIDCardAccountVerify","channel_cellphone","operator_capricorn","address_match","channel_idcard","channel_bankby3","channel_idNameFase","channel_criminal","channel_blacklistverify","credit_implement"] # def filter(app_key,api): # not (app_key in xiaoshudian_app_key and api not in xiaoshudian_tec_api) data_rdd = data.rdd.map(lambda a: (a.app_key_param, a.date, standard_params(a.params), a.interface, a.api_type)) \ .map(lambda (a, b, c, d, e): (a, b, c[0], c[1], c[2], c[3], d, e)) ''' c[0]->phone c[1]->idcard c[2]->idbank c[3]->name ''' schemaStr = "app_key date phone idcard idbank name interface api_type" fields = [ StructField(field_name, StringType(), True) for field_name in schemaStr.split() ] schema = StructType(fields) data_df = hc.createDataFrame(data_rdd, schema).distinct() dfw = DataFrameWriter(data_df) dfw.saveAsTable("wl_analysis.t_lel_record_data_backflow", mode="overwrite")
def saveToHive(hiveContext, rddData, out_table):
    hiveContext.sql("truncate table " + out_table)
    df = hiveContext.createDataFrame(rddData, ["defectid", "categories", "loaded_timestamp"])
    DataFrameWriter(df).insertInto(out_table)
    lambda a, b: a + b).map(lambda x: tfin(x))

# In[27]:
# rdd2.take(10)

# In[23]:
tdf2 = hiveContext.createDataFrame(rdd2, ['uid', 'loc_ts_dur', 'data_dt'])

# In[24]:
# tdf2.collect()

# In[25]:
df_writer = DataFrameWriter(tdf2)
df_writer.insertInto(oHiveTable, overwrite=True)

# In[26]:
sc.stop()

# In[31]:
# toLocDurTuples('101,2016-06-01@12:04:02,[40:50][202:203],20160601')

# In[32]:
# tfin(('101_20160601', [(202, 1464782642, 40), (203, 1464782682, 50)]))
    indices_to_terms(model.vocabulary)("termIndices"))
# ti.persist(StorageLevel.MEMORY_AND_DISK)
ti = ti.withColumn('topicString', parseInts(ti["topic"]))
ti = ti.withColumn('termWeightsString', parseThetas(ti["termWeights"]))
ti = ti.withColumn('topics_wordsString', parseTerms(ti["topics_words"]))
# print(ti.columns)

# save file - terms per topic
# f = open(savePath + 'terms.csv','w')

# save file - topic distribution and terms
writer = DataFrameWriter(df2.select('id', 'thetaString'))
writer.csv('file://' + savePath + thisSet + '.thetas')
writer = DataFrameWriter(
    ti.select('topicString', 'termWeightsString', 'topics_wordsString'))
writer.csv('file://' + savePath + thisSet + '.terms')
"""
#save file - terms per topic
f = open(savePath + thisSet + '.terms.csv','w')
for row in ti2:
    items = len(row[3])
    for i in range(items):
        f.write(str(row[0]) + ',' + row[3][i].encode('utf-8') + ',' + str(row[2][i]) + '\n')
def _aliased_writer(df_writer: DataFrameWriter, format_key: str,
                    path: Optional[str], **options: str):
    """Saves the dataframe to a file of the given type at the given path."""
    return df_writer.format(format_key).save(path, **options)
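An illustrative call of the helper above; the format key, path, and header option are assumptions, not from the original. df.write is the usual way to obtain a DataFrameWriter for an existing DataFrame.

# Writes df as CSV with a header row via the generic format/save path.
_aliased_writer(df.write, "csv", "/tmp/output_csv", header="true")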
def save_df(self, path, df):
    self.delete_from_hdfs(path)
    writer = DataFrameWriter(df)
    writer.save(path)
if ob.has_key("mobile"): phone = ob.get("mobile", None) elif ob.has_key("phone"): phone = ob.get("phone", None) elif ob.has_key("enc_m"): phone = ob.get("enc_m", None) elif ob.has_key("ownerMobile"): phone = ob.get("ownerMobile", None) elif ob.has_key("bankpremobile"): phone = ob.get("bankpremobile", None) else: phone = None return (phone, idCard, idBank, name) # xiaoshudian_app_key = "1186159692" # xiaoshudian_tec_api = ["tel","address_getbymobile","channel_NameIDCardAccountVerify","channel_cellphone","operator_capricorn","address_match","channel_idcard","channel_bankby3","channel_idNameFase","channel_criminal","channel_blacklistverify","credit_implement"] # def filter(app_key,api): # not (app_key in xiaoshudian_app_key and api not in xiaoshudian_tec_api) data_rdd = data.rdd.map(lambda a:(a.app_key_param,a.date,standard_params(a.params_less),a.interface,a.api_type))\ .map(lambda (a,b,c,d,e):(a,b,c[0],c[1],c[2],c[3],d,e)) schemaStr = "app_key date phone idcard idbank name interface api_type" fields = [ StructField(field_name, StringType(), True) for field_name in schemaStr.split() ] schema = StructType(fields) data_df = sqlContext.createDataFrame(data_rdd, schema).distinct() dfw = DataFrameWriter(data_df) dfw.saveAsTable("wl_analysis.t_lel_datamart_backflow_filtered", mode="overwrite")
def write(self, df, table, mode='append'):
    writer = DataFrameWriter(df)
    writer.jdbc(self.url_connect, table, mode, self.properties)
def init_balance(self, ctime, tp):
    """
    Processing flow:
        The method receives ctime in '%Y-%m-%d' format and the business type tp.
        While still within the current month:
            Call the stored procedure Calc_balance (it updates the balance for the current date).
            Save the current balance to HDFS.
            Advance by one day.
        Copy every day of the current month to local files and merge them into xxxx_xx.data.
        Move the merged xxxx_xx.data file back to HDFS.
        Read that file, compute the average daily balance, and save the result to HDFS.
        Merge the result into a single file xxxx_xx-r.data.
        Load the merged result file xxxx_xx-r.data and import it into MySQL.
    :param ctime: ctime in '%Y-%m-%d' format
    :param tp: business type
    :return:
    """
    try:
        cdate = datetime.datetime.strptime(ctime, '%Y-%m-%d')
        end_date = cdate
        # The merged file is named year_month.data
        merged_file_name = '%s.data' % cdate.strftime('%Y_%m')
        result_file_name = '%s-r.data' % cdate.strftime('%Y_%m')
        # HDFS input path
        hdfs_root_path = '%s/jjzhu/test/_%s/%s/' % (self.hdfs_root_url, str(tp.value),
                                                    cdate.strftime('%Y_%m'))
        # Local temporary directory path
        local_temp_path = '%s/temp/_%s/' % (self.local_root_url, str(tp.value))

        def exist_or_create(path):
            if not os.path.exists(path):
                self.logger.warning('local path: %s is not exist' % path)
                self.logger.info('creating dir: %s' % path)
                os.makedirs(path)
            self.logger.info('local path: %s is already existed' % path)

        exist_or_create(local_temp_path)
        # Local output path for the merged file
        local_output_path = '%s/output/_%s/' % (self.local_root_url, str(tp.value))
        exist_or_create(local_output_path)
        local_result_path = '%s/result/_%s/' % (self.local_root_url, str(tp.value))
        exist_or_create(local_result_path)
        # HDFS output path for the AUM result
        hdfs_save_path = '%s/jjzhu/test/result/_%s/%s' % (self.hdfs_root_url, str(tp.value),
                                                          cdate.strftime('%Y_%m'))
        # HDFS input path for the AUM source file
        hdfs_input_path = '%s/jjzhu/test/input/_%s/' % (self.hdfs_root_url, str(tp.value))
        if os.system('hadoop fs -mkdir %s' % hdfs_input_path) == 0:
            self.logger.warning('%s is not exist, created it successful' % hdfs_input_path)
        target_save_table = 't_CMMS_ASSLIB_ASSET_c'  # TODO
        save_mode = 'append'
        local_output_file_path = local_output_path + merged_file_name
        local_dir_for_merge = local_temp_path + cdate.strftime('%Y_%m')

        # Convert a Row() into a tuple
        def change_row(row):
            row_dict = row.asDict()
            return (row_dict['CUST_NO'], row_dict['ACCT_NO15'], str(row_dict['CURR_BAL']),
                    str(row_dict['TIME']), row_dict['CUR'],
                    row_dict['CURR_DATE'].strftime('%Y-%m-%d'), str(row_dict['BUSI_TYPE']))

        def calc_avg(el):
            r = el[1]
            return (r[0], r[1], float(r[2]) / int(r[3]), r[4],
                    datetime.datetime.strptime(str(r[5]), '%Y-%m-%d').strftime('%Y-%m'), r[6])

        # Copy each month's data from HDFS to the local filesystem
        def copy_from_hdfs_to_local(hdfs_dir, local_dir):
            import sys
            if not os.path.exists(local_dir):
                self.logger.warning('%s is not existed, create it first.'
                                    % local_dir)
                os.system('mkdir ' + local_dir)
                # exit(-1)
            shell_command = 'hadoop fs -copyToLocal %s %s' % (hdfs_dir, local_dir)
            self.logger.info('execute hdfs shell command: %s' % shell_command)
            os.system(shell_command)

        # Merge the per-day part files into a single file
        def merge_file(input_dir, output_path):
            import platform
            if output_path.__contains__('/data'):
                output_file = open(output_path, 'w+')
                if platform.system() == 'Windows':
                    deli = '\\'
                elif platform.system() == 'Linux':
                    deli = '/'
                else:
                    self.logger.error('unknown platform: %s' % platform.system())
                for i in os.listdir(input_dir):
                    curr_dir = input_dir + deli + i
                    for j in os.listdir(curr_dir):
                        if j.startswith('part'):  # only merge output files whose names start with 'part'
                            with open(curr_dir + deli + j) as f:
                                # print(curr_dir+deli+j)
                                for line in f.readlines():
                                    # print(line[1:len(line)-2])
                                    output_file.write(line[1:len(line) - 2] + '\n')
                output_file.close()
            else:
                self.logger.error('please make sure your input path is under /data')
                exit(-1)

        while end_date.month == cdate.month:
            # print(end_date)
            hdfs_path = hdfs_root_path + end_date.strftime('%Y_%m_%d')
            # Call the MySQL stored procedure to update the current balance
            self.logger.info('call Calc_balance(%s, %s)'
                             % (end_date.strftime('%Y-%m-%d'), str(tp.value)))
            self.mysqlconn.execute_single('call Calc_balance(%s, %s)',
                                          (end_date.strftime('%Y-%m-%d'), str(tp.value)))
            # Fetch the current balance
            curr_bal_df = self.load_from_mysql('core', 't_CMMS_TEMP_DAYBAL_T').select(
                'CUST_NO', 'ACCT_NO15', 'CURR_BAL', 'TIME', 'CUR', 'CURR_DATE', 'BUSI_TYPE'
            )
            curr_bal_rdd = curr_bal_df.filter(curr_bal_df.BUSI_TYPE == tp.value).map(change_row)
            print(curr_bal_rdd.take(1))
            if curr_bal_rdd.count() == 0:
                self.logger.warning('rdd is empty')
                continue
            if os.system('hadoop fs -test -e ' + hdfs_path) == 0:
                self.logger.warning('%s is already existed, deleting' % hdfs_path)
                if os.system('hadoop fs -rm -r ' + hdfs_path) == 0:
                    self.logger.info('delete %s successful' % hdfs_path)
            # Save the current balance info to HDFS so it can be merged later
            self.logger.info('save rdd context to %s' % hdfs_path)
            curr_bal_rdd.saveAsTextFile(hdfs_path)
            end_date += datetime.timedelta(1)
        # Copy the current month's data to the local filesystem for merging
        self.logger.info('copy file from %s to %s' % (hdfs_root_path, local_temp_path))
        copy_from_hdfs_to_local(hdfs_root_path, local_temp_path)
        self.logger.info('merge file content\n\tinput dir: %s\n\toutput_file: %s'
                         % (local_dir_for_merge, local_output_file_path))
        merge_file(local_dir_for_merge, local_output_file_path)
        # Move the merged file back to HDFS as the input for the AUM calculation
        if os.system('hadoop fs -test -e ' + hdfs_input_path) == 1:
            self.logger.warning('hdfs dir: %s is not exist' % hdfs_input_path)
            self.logger.info('\texecute hdfs command: hadoop fs -mkdir %s' % hdfs_input_path)
            os.system('hadoop fs -mkdir ' + hdfs_input_path)
        if os.system('hadoop fs -test -e ' + hdfs_input_path + merged_file_name) == 0:
            self.logger.info('hdfs file: %s is already exist' % hdfs_input_path + merged_file_name)
            self.logger.info('\texcute hdfs command: hadoop fs -rm ' + hdfs_input_path + merged_file_name)
            os.system('hadoop fs -rm ' + hdfs_input_path + merged_file_name)
        os.system('hadoop fs -put ' + local_output_file_path + ' ' + hdfs_input_path)
        # Calculate AUM
        self.logger.info('start calculate AUM of %s' % cdate.strftime('%Y-%m'))
        all_data = self.sc.textFile(hdfs_input_path + merged_file_name)
        day_bal = all_data.map(lambda line: line.strip().split(',')) \
            .map(lambda fields: (fields[0].strip()[1:len(fields[0].strip()) - 1],  # CUST_NO
                                 fields[1].strip()[1:len(fields[1].strip()) - 1],  # ACCT_NO15
                                 float(fields[2].strip()[1:len(fields[2].strip()) - 1]),  # CURR_BAL
                                 1,  # TIME
                                 fields[4].strip()[1:len(fields[4].strip()) - 1],  # CUR
                                 fields[5].strip()[1:len(fields[5].strip()) - 1],  # CURR_DATE
                                 fields[6].strip()[1:len(fields[6].strip()) - 1],  # BUSI_TYPE
                                 ))
        # Key by ACCT_NO15, accumulate balance and count, then compute the average daily balance
        add_bal = day_bal.map(lambda fields: (fields[1], fields)) \
            .reduceByKey(lambda a, b: (a[0], a[1], float(float(a[2]) + float(b[2])),
                                       int(a[3]) + int(b[3]), a[4], max(a[5], b[5]), a[6])) \
            .map(calc_avg)
        # If the save directory already exists, delete it; otherwise saveAsTextFile would fail
        if os.system('hadoop fs -test -e ' + hdfs_save_path) == 0:
            os.system('hadoop fs -rm -r ' + hdfs_save_path)
        # add_bal.cache()
        add_bal.saveAsTextFile(hdfs_save_path)
        os.system('hadoop fs -getmerge %s %s%s' % (hdfs_save_path, local_result_path, result_file_name))
        os.system('hadoop fs -put %s%s %s' % (local_result_path, result_file_name, hdfs_input_path))
        result_rdd = self.sc.textFile(hdfs_input_path + result_file_name)
        mapped_rdd = result_rdd.map(lambda line: line[1: len(line) - 1]) \
            .map(lambda line: line.split(',')) \
            .map(lambda fields: (fields[0].strip()[1:len(fields[0].strip()) - 1],
                                 fields[1].strip()[1:len(fields[1].strip()) - 1],  # ACCT_NO15
                                 float(fields[2].strip()[0:len(fields[2].strip())]),  # AUM
                                 fields[3].strip()[1:len(fields[3].strip()) - 1],  # CUR
                                 fields[4].strip()[1:len(fields[4].strip()) - 1],  # STAT_DAT
                                 fields[5].strip()[1:len(fields[5].strip()) - 1],  # ASS_TYPE
                                 ))
        # Rebuild the corresponding DataFrame and create its writer
        writer = DataFrameWriter(self.sql_context.createDataFrame(
            mapped_rdd, ['CUST_NO', 'ACC_NO15', 'AUM', 'CUR', 'STAT_DAT', 'ASS_TYPE']))
        # self.mysql_url = 'jdbc:mysql://10.9.29.212:3306/core?user=root&characterEncoding=UTF-8'
        # Save the DataFrame contents to the target table using the specified save mode
        writer.jdbc(self.mysql_url, table=target_save_table, mode=save_mode)
    except ValueError as e:
        self.logger.error(e)
        exit(-1)