def write_to_db(df, table, mode='append', database='project_db', schema='public'): db_property = connect_to_db(database) #Write to database writer = DataFrameWriter(df) writer.jdbc(db_property['url'], table, mode, db_property)
def write(self, df, table, md): """ Args: df: pandas.dataframe table: str md: str :rtype: None """ writer = DataFrameWriter(df) writer.jdbc(self.url_connect, table, md, self.properties)
def write(self, df, table_name, mode): """ Args: df: spark.dataframe table_name: str moded: str :rtype: None """ writer = DataFrameWriter(df) writer.jdbc(self.url_connect, table_name, mode, self.properties)
def write_jdbc(self, df, table, mode): writer = DataFrameWriter(df) url = self.cf.get('mysql', 'url') properties = { 'user': self.cf.get('mysql', 'user'), 'password': self.cf.get('mysql', 'password'), 'driver': self.cf.get('mysql', 'driver') } writer.jdbc(url, table, mode, properties)
def format_save(writer: DataFrameWriter, data_source: DataSource) -> None: if data_source.data_format is DataFormat.CSV: writer.option("header", True).csv(data_source.source_path) if data_source.data_format is DataFormat.PARQUET: writer.parquet(data_source.source_path) if data_source.data_format is DataFormat.JSON: writer.json(data_source.source_path) if data_source.data_format is DataFormat.JDBC: if type(data_source) is JDBCDataSource: ## TODO writer.jdbc()
def write_to_DB(df, table_name): url = 'jdbc:postgresql://{}/{}'.format(config['postgres_conn']['host'], config['postgres_conn']['database']) properties = { 'user': config['postgres_conn']['user'], 'password': config['postgres_conn']['password'], "driver": "org.postgresql.Driver" } #tablename = 'events' writer = DataFrameWriter(df) writer.jdbc(url=url, table=table_name, properties=properties, mode='append')
def stat_act_time(self): """ 这里是统计每张卡每个月每种业务(定、活、理)办理的次数 :return: """ target_table = 't_CMMS_TEMP_ACTTIME' save_mode = 'append' acc_detail_source = self.sc.textFile('%s/jjzhu/test/input/t_CMMS_ACCOUNT_DETAIL.txt' % self.hdfs_root_url) # 这里是更新t_CMMS_TEMP_ACTTIME表的 acc_list = self.load_from_mysql('core', 't_CMMS_ACCOUNT_LIST') \ .map(lambda row: (row.asDict()['ACC_NO15'], (row.asDict()['CUST_NO'], row.asDict()['ACC_NAM']))).distinct() acc_list.cache() # 添加缓存 split_rdd = acc_detail_source.map(lambda line: line.split(',')) split_rdd.cache() # 添加缓存 start_time = '2016-02-01' # TODO 之后得改成最早的时间 end_time = '2016-08-01' # TODO 最晚的时间 time_slots = self.get_time_slot(start_time, end_time) for slot in time_slots: self.logger.info('statistic action time of %s' % slot[0][0: slot[0].rindex('-')]) # 以 客户号+业务类型为key,统计客户不同类型的交易次数 act_time = split_rdd.filter(lambda columns: columns[1] <= slot[1]) \ .filter(lambda columns: columns[1] >= slot[0]) \ .map(lambda columns: (columns[3] + '_' + columns[6], 1)) \ .reduceByKey(lambda a, b: a + b) mapped_act_time = act_time.map(lambda fields: (fields[0].split('_')[0], (fields[0].split('_')[0], fields[0].split('_')[1], fields[1], slot[0][0: slot[0].rindex('-')]))) # join操作,连接客户号对应的具体信息 result = mapped_act_time.join(acc_list).map(lambda fields: fields[1][0] + fields[1][1]) # ACCT_NO15, BUST_TYPE, ACT_TIME, CURR_MONTH, CUST_NO, ACCT_NAM # '101006463122653', '1', 25, '2016-02-01', '81024082971', '曹镇颜' result_writer = DataFrameWriter( self.sql_context.createDataFrame(result, ['ACCT_NO15', 'BUST_TYPE', 'ACT_TIME', 'CURR_MONTH', 'CUST_NO', 'ACCT_NAM'])) self.logger.info('save the statistic result into mysql\n' 'url: %s\n' 'target table: %s\n' 'mode: %s' % (self.mysql_url, target_table, save_mode)) result_writer.jdbc(self.mysql_url, table=target_table, mode=save_mode)
def init_balance(self, ctime, tp): """ 处理流程: 方法接收 格式为'%Y-%m-%d'的ctime和业务类型tp while 当前月: 调用存储过程 Calc_balance(该存储过程更新当前日期的余额) 将当前余额保存到hdfs中 天数+1 将当前月每一天的余额复制到本地文件中进行合并xxxx_xx.data 将xxxx_xx.data合并文件移到hdfs中去 读取该文件,进行日均余额的计算,并保存结果到hdfs中 将结果合并至一个文件xxxx_xx-r.data 加载结果合并文件xxxx_xx-r.data,导入mysql数据库 :param ctime:格式为'%Y-%m-%d'的ctime :param tp: 业务类型 :return: """ try: cdate = datetime.datetime.strptime(ctime, '%Y-%m-%d') end_date = cdate # 合并后的文件名为 年份_月份.data merged_file_name = '%s.data' % cdate.strftime('%Y_%m') result_file_name = '%s-r.data' % cdate.strftime('%Y_%m') # hdfs输入路径 hdfs_root_path = '%s/jjzhu/test/_%s/%s/' % (self.hdfs_root_url, str(tp.value), cdate.strftime('%Y_%m')) # 本地的临时目录路径 local_temp_path = '%s/temp/_%s/' % (self.local_root_url, str(tp.value)) def exist_or_create(path): if not os.path.exists(path): self.logger.warning('local path: %s is not exist' % path) self.logger.info('creating dir: %s' % path) os.makedirs(path) self.logger.info('local path: %s is already existed' % path) exist_or_create(local_temp_path) # 本地合并后的文件的输出路径 local_output_path = '%s/output/_%s/' % (self.local_root_url, str(tp.value)) exist_or_create(local_output_path) local_result_path = '%s/result/_%s/' % (self.local_root_url, str(tp.value)) exist_or_create(local_result_path) # AUM计算结果的hdfs输出路径 hdfs_save_path = '%s/jjzhu/test/result/_%s/%s' % (self.hdfs_root_url, str(tp.value), cdate.strftime('%Y_%m')) # 计算AUM原文件的hdfs输入路径 hdfs_input_path = '%s/jjzhu/test/input/_%s/' % (self.hdfs_root_url, str(tp.value)) if os.system('hadoop fs -mkdir %s' % hdfs_input_path) == 0: self.logger.warning('%s is not exist, created it successful' % hdfs_input_path) target_save_table = 't_CMMS_ASSLIB_ASSET_c' # TODO save_mode = 'append' local_output_file_path = local_output_path + merged_file_name local_dir_for_merge = local_temp_path + cdate.strftime('%Y_%m') # Row()转换成tuple def change_row(row): row_dict = row.asDict() return (row_dict['CUST_NO'], row_dict['ACCT_NO15'], str(row_dict['CURR_BAL']), str(row_dict['TIME']), row_dict['CUR'], row_dict['CURR_DATE'].strftime('%Y-%m-%d'), str(row_dict['BUSI_TYPE'])) def calc_avg(el): r = el[1] return (r[0], r[1], float(r[2]) / int(r[3]), r[4], datetime.datetime.strptime(str(r[5]), '%Y-%m-%d').strftime('%Y-%m'), r[6]) # 将hdfs上每个月的数据移到本地 def copy_from_hdfs_to_local(hdfs_dir, local_dir): import sys if not os.path.exists(local_dir): self.logger.warning('%s is not existed, create it first.' % local_dir) os.system('mkdir ' + local_dir) # exit(-1) shell_command = 'hadoop fs -copyToLocal %s %s' % (hdfs_dir, local_dir) self.logger.info('execute hdfs shell command: %s' % shell_command) os.system(shell_command) # 合并文件 def merge_file(input_dir, output_path): import platform if output_path.__contains__('/data'): output_file = open(output_path, 'w+') if platform.system() == 'Windows': deli = '\\' elif platform.system() == 'Linux': deli = '/' else: self.logger.error('unknown platform: %s' % platform.system()) for i in os.listdir(input_dir): curr_dir = input_dir + deli + i for j in os.listdir(curr_dir): if j.startswith('part'): # 只合并以part开头的输出文件 with open(curr_dir + deli + j) as f: # print(curr_dir+deli+j) for line in f.readlines(): # print(line[1:len(line)-2]) output_file.write(line[1:len(line) - 2] + '\n') output_file.close() else: self.logger.error('please make sure your input path is under /data') exit(-1) while end_date.month == cdate.month: # print(end_date) hdfs_path = hdfs_root_path + end_date.strftime('%Y_%m_%d') # 调用mysql存储过程,更新当前余额 self.logger.info('call Calc_balance(%s, %s)' % (end_date.strftime('%Y-%m-%d'), str(tp.value))) self.mysqlconn.execute_single('call Calc_balance(%s, %s)', (end_date.strftime('%Y-%m-%d'), str(tp.value))) # 获取当前余额 curr_bal_df = self.load_from_mysql('core', 't_CMMS_TEMP_DAYBAL_T').select( 'CUST_NO', 'ACCT_NO15', 'CURR_BAL', 'TIME', 'CUR', 'CURR_DATE', 'BUSI_TYPE' ) curr_bal_rdd = curr_bal_df.filter(curr_bal_df.BUSI_TYPE == tp.value).map(change_row) print(curr_bal_rdd.take(1)) if curr_bal_rdd.count() == 0: self.logger.warning('rdd is empty') continue if os.system('hadoop fs -test -e ' + hdfs_path) == 0: self.logger.warning('%s is already existed, deleting' % hdfs_path) if os.system('hadoop fs -rm -r ' + hdfs_path) == 0: self.logger.info('delete %s successful' % hdfs_path) # 保存当前余额信息到hdfs上以便合并 self.logger.info('save rdd context to %s' % hdfs_path) curr_bal_rdd.saveAsTextFile(hdfs_path) end_date += datetime.timedelta(1) # 将当前月移到本地进行合并 self.logger.info('copy file from %s to %s' % (hdfs_root_path, local_temp_path)) copy_from_hdfs_to_local(hdfs_root_path, local_temp_path) self.logger.info('merge file content\n\tinput dir: %s\n\toutput_file: %s' % (local_dir_for_merge, local_output_file_path)) merge_file(local_dir_for_merge, local_output_file_path) # 合并后的文件移回hdfs作为AUM的输入 if os.system('hadoop fs -test -e ' + hdfs_input_path) == 1: self.logger.warning('hdfs dir: %s is not exist' % hdfs_input_path) self.logger.info('\texecute hdfs command: hadoop fs -mkdir %s' % hdfs_input_path) os.system('hadoop fs -mkdir ' + hdfs_input_path) if os.system('hadoop fs -test -e ' + hdfs_input_path + merged_file_name) == 0: self.logger.info('hdfs file: %s is already exist' % hdfs_input_path + merged_file_name) self.logger.info('\texcute hdfs command: hadoop fs -rm ' + hdfs_input_path + merged_file_name) os.system('hadoop fs -rm ' + hdfs_input_path + merged_file_name) os.system('hadoop fs -put ' + local_output_file_path + ' ' + hdfs_input_path) # 计算AUM self.logger.info('start calculate AUM of %s' % cdate.strftime('%Y-%m')) all_data = self.sc.textFile(hdfs_input_path + merged_file_name) day_bal = all_data.map(lambda line: line.strip().split(',')) \ .map(lambda fields: (fields[0].strip()[1:len(fields[0].strip()) - 1], # CUST_NO fields[1].strip()[1:len(fields[1].strip()) - 1], # ACCT_NO15 float(fields[2].strip()[1:len(fields[2].strip()) - 1]), # CURR_BAL 1, # TIME fields[4].strip()[1:len(fields[4].strip()) - 1], # CUR fields[5].strip()[1:len(fields[5].strip()) - 1], # CURR_DATE fields[6].strip()[1:len(fields[6].strip()) - 1], # BUSI_TYPE )) # 用ACCT_NO15位主键,并累加余额和次数,然后在计算日均余额 add_bal = day_bal.map(lambda fields: (fields[1], fields)) \ .reduceByKey(lambda a, b: ( a[0], a[1], float(float(a[2]) + float(b[2])), int(a[3]) + int(b[3]), a[4], max(a[5], b[5]), a[6])) \ .map(calc_avg) # 判断保存目录路径是否存在,若存在,则将其删除,否则savaAsTexstFile操作会报错 if os.system('hadoop fs -test -e ' + hdfs_save_path) == 0: os.system('hadoop fs -rm -r ' + hdfs_save_path) # add_bal.cache() add_bal.saveAsTextFile(hdfs_save_path) os.system('hadoop fs -getmerge %s %s%s' % (hdfs_save_path, local_result_path, result_file_name)) os.system('hadoop fs -put %s%s %s' % (local_result_path, result_file_name, hdfs_input_path)) result_rdd = self.sc.textFile(hdfs_input_path + result_file_name) mapped_rdd = result_rdd.map(lambda line: line[1: len(line) - 1]) \ .map(lambda line: line.split(',')) \ .map(lambda fields: (fields[0].strip()[1:len(fields[0].strip()) - 1], fields[1].strip()[1:len(fields[1].strip()) - 1], # ACCT_NO15 float(fields[2].strip()[0:len(fields[2].strip())]), # AUM fields[3].strip()[1:len(fields[3].strip()) - 1], # CUR fields[4].strip()[1:len(fields[4].strip()) - 1], # STAT_DAT fields[5].strip()[1:len(fields[5].strip()) - 1], # ASS_TYPE )) # 重新创建对应的DataFrame并创建对应的writer writer = DataFrameWriter(self.sql_context.createDataFrame(mapped_rdd, ['CUST_NO', 'ACC_NO15', 'AUM', 'CUR', 'STAT_DAT', 'ASS_TYPE'])) # self.mysql_url = 'jdbc:mysql://10.9.29.212:3306/core?user=root&characterEncoding=UTF-8' # 将DF中的内容以指定的方式保存到指定的表中 writer.jdbc(self.mysql_url, table=target_save_table, mode=save_mode) except ValueError as e: self.logger.error(e) exit(-1)
def write(self, df, table, mode='append'): writer = DataFrameWriter(df) writer.jdbc(self.url_connect, table, mode, self.properties)
from pyspark import SparkConf from pyspark.sql import SQLContext # spark context sc=SparkContext() #sc._jsc.hadoopConfiguration().set("fs.s3a.awsAccessKeyId", "XXXX") #sc._jsc.hadoopConfiguration().set("fs.s3a.awsSecretAccessKey", "XXX") # SparkSQL context sqlContext = SQLContext(sc) filename = "RC_2006-01.parquet" destination="s3a://reddit-comments-raw/2006/comments_2006_05.parquet" data = sqlContext.read.parquet(destination) print("done",data.count()) from pyspark.sql import DataFrameWriter my_writer = DataFrameWriter(data) md="overwrite" database_name = 'reddit' hostname = 'ec2-54-214-117-182.us-west-2.compute.amazonaws.com' url_connect = "jdbc:postgresql://{hostname}:5000/{db}".format(hostname=hostname, db=database_name) properties = {"user":"******","password" : "webuiuser","driver": "org.postgresql.Driver"} table="word_count" my_writer.jdbc(url=url_connect,table= table,mode=md,properties=properties)