def write_to_db(df,
                table,
                mode='append',
                database='project_db',
                schema='public'):
    db_property = connect_to_db(database)
    # write the DataFrame to the database table over JDBC
    writer = DataFrameWriter(df)
    writer.jdbc(db_property['url'], table, mode, db_property)
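# connect_to_db() is not shown in this snippet. A minimal sketch of what it
# might return, assuming JDBC settings are kept in a dict keyed by database
# name (the URL, user, password and driver below are placeholders, not values
# from the original project):
def connect_to_db(database):
    connections = {
        'project_db': {
            'url': 'jdbc:postgresql://localhost:5432/project_db',  # placeholder
            'user': 'spark_user',                                  # placeholder
            'password': 'change_me',                               # placeholder
            'driver': 'org.postgresql.Driver',
        }
    }
    # write_to_db() passes this whole dict as the JDBC properties argument
    return connections[database]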
    def write(self, df, table, md):
        """
        Args:
            df: pyspark.sql.DataFrame
            table: str
            md: str, JDBC save mode (e.g. 'append' or 'overwrite')
        :rtype: None
        """
        writer = DataFrameWriter(df)
        writer.jdbc(self.url_connect, table, md, self.properties)
Example #3
    def write(self, df, table_name, mode):
        """
        Args:
            df: pyspark.sql.DataFrame
            table_name: str
            mode: str, JDBC save mode (e.g. 'append' or 'overwrite')
        :rtype: None
        """
        writer = DataFrameWriter(df)
        writer.jdbc(self.url_connect, table_name, mode, self.properties)
Example #4
    def write_jdbc(self, df, table, mode):
        writer = DataFrameWriter(df)
        url = self.cf.get('mysql', 'url')
        properties = {
            'user': self.cf.get('mysql', 'user'),
            'password': self.cf.get('mysql', 'password'),
            'driver': self.cf.get('mysql', 'driver')
        }

        writer.jdbc(url, table, mode, properties)
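# self.cf above behaves like a configparser.ConfigParser (an assumption based
# on the .get('mysql', ...) calls). A sketch of how such a config could be
# built in code, with placeholder values:
import configparser

cf = configparser.ConfigParser()
cf['mysql'] = {
    'url': 'jdbc:mysql://localhost:3306/analytics?characterEncoding=UTF-8',  # placeholder
    'user': 'spark_user',                                                    # placeholder
    'password': 'change_me',                                                 # placeholder
    'driver': 'com.mysql.jdbc.Driver',
}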
Example #5
def format_save(writer: DataFrameWriter, data_source: DataSource) -> None:
    if data_source.data_format is DataFormat.CSV:
        writer.option("header", True).csv(data_source.source_path)
    elif data_source.data_format is DataFormat.PARQUET:
        writer.parquet(data_source.source_path)
    elif data_source.data_format is DataFormat.JSON:
        writer.json(data_source.source_path)
    elif data_source.data_format is DataFormat.JDBC:
        if type(data_source) is JDBCDataSource:
            # TODO: supply the JDBC url, table name and connection properties
            writer.jdbc()
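# A hedged sketch of how the JDBC branch's TODO might be completed, assuming a
# hypothetical JDBCDataSource that carries the connection URL, table name and
# JDBC properties (these attribute names are illustrative, not from the
# original source):
def save_jdbc(writer: DataFrameWriter, jdbc_source) -> None:
    writer.jdbc(jdbc_source.url,                     # hypothetical attribute
                jdbc_source.table,                   # hypothetical attribute
                mode='append',
                properties=jdbc_source.properties)   # hypothetical attribute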
Example #6
def write_to_DB(df, table_name):
    url = 'jdbc:postgresql://{}/{}'.format(config['postgres_conn']['host'],
                                           config['postgres_conn']['database'])
    properties = {
        'user': config['postgres_conn']['user'],
        'password': config['postgres_conn']['password'],
        "driver": "org.postgresql.Driver"
    }
    #tablename = 'events'
    writer = DataFrameWriter(df)
    writer.jdbc(url=url,
                table=table_name,
                properties=properties,
                mode='append')
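# write_to_DB() relies on a module-level `config` mapping with a
# 'postgres_conn' section. A sketch of the expected shape, inferred from the
# lookups above (all values are placeholders):
config = {
    'postgres_conn': {
        'host': 'localhost:5432',   # placeholder host[:port]
        'database': 'events_db',    # placeholder
        'user': 'spark_user',       # placeholder
        'password': 'change_me',    # placeholder
    }
}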
Example #7
    def stat_act_time(self):
        """
        这里是统计每张卡每个月每种业务(定、活、理)办理的次数
        :return:
        """
        target_table = 't_CMMS_TEMP_ACTTIME'
        save_mode = 'append'
        acc_detail_source = self.sc.textFile('%s/jjzhu/test/input/t_CMMS_ACCOUNT_DETAIL.txt' % self.hdfs_root_url)
        # this part updates the t_CMMS_TEMP_ACTTIME table
        acc_list = self.load_from_mysql('core', 't_CMMS_ACCOUNT_LIST') \
            .map(lambda row: (row.asDict()['ACC_NO15'], (row.asDict()['CUST_NO'], row.asDict()['ACC_NAM']))).distinct()
        acc_list.cache()  # cache for reuse across time slots
        split_rdd = acc_detail_source.map(lambda line: line.split(','))
        split_rdd.cache()  # cache for reuse across time slots
        start_time = '2016-02-01'  # TODO: replace with the earliest available date
        end_time = '2016-08-01'  # TODO: replace with the latest available date
        time_slots = self.get_time_slot(start_time, end_time)
        for slot in time_slots:
            self.logger.info('statistic action time of %s' % slot[0][0: slot[0].rindex('-')])
            # key by customer number + business type and count each customer's transactions per type
            act_time = split_rdd.filter(lambda columns: columns[1] <= slot[1]) \
                .filter(lambda columns: columns[1] >= slot[0]) \
                .map(lambda columns: (columns[3] + '_' + columns[6], 1)) \
                .reduceByKey(lambda a, b: a + b)

            mapped_act_time = act_time.map(lambda fields:
                                           (fields[0].split('_')[0],
                                            (fields[0].split('_')[0], fields[0].split('_')[1],
                                             fields[1], slot[0][0: slot[0].rindex('-')])))
            # join to attach the customer details for each customer number
            result = mapped_act_time.join(acc_list).map(lambda fields: fields[1][0] + fields[1][1])

            #  ACCT_NO15, BUST_TYPE, ACT_TIME, CURR_MONTH, CUST_NO, ACCT_NAM
            # '101006463122653', '1', 25, '2016-02-01', '81024082971', '曹镇颜'

            result_writer = DataFrameWriter(
                self.sql_context.createDataFrame(result, ['ACCT_NO15', 'BUST_TYPE', 'ACT_TIME',
                                                          'CURR_MONTH', 'CUST_NO', 'ACCT_NAM']))
            self.logger.info('save the statistic result into mysql\n'
                             'url: %s\n'
                             'target table: %s\n'
                             'mode: %s' % (self.mysql_url, target_table, save_mode))
            result_writer.jdbc(self.mysql_url, table=target_table, mode=save_mode)
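    # self.get_time_slot() is not shown. A minimal sketch of what it might do,
    # assuming it returns one (month start, month end) pair of '%Y-%m-%d'
    # strings per month in the range (an interpretation inferred from how
    # `slot` is used above):
    def get_time_slot(self, start_time, end_time):
        import calendar
        import datetime
        slots = []
        current = datetime.datetime.strptime(start_time, '%Y-%m-%d').replace(day=1)
        end = datetime.datetime.strptime(end_time, '%Y-%m-%d')
        while current < end:
            last_day = calendar.monthrange(current.year, current.month)[1]
            slots.append((current.strftime('%Y-%m-%d'),
                          current.replace(day=last_day).strftime('%Y-%m-%d')))
            # jump to the first day of the next month
            current = current.replace(day=last_day) + datetime.timedelta(days=1)
        return slots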
Example #8
    def init_balance(self, ctime, tp):
        """
        处理流程:
            方法接收 格式为'%Y-%m-%d'的ctime和业务类型tp
            while 当前月:
                调用存储过程 Calc_balance(该存储过程更新当前日期的余额)
                将当前余额保存到hdfs中
                天数+1
            将当前月每一天的余额复制到本地文件中进行合并xxxx_xx.data
            将xxxx_xx.data合并文件移到hdfs中去
            读取该文件,进行日均余额的计算,并保存结果到hdfs中
            将结果合并至一个文件xxxx_xx-r.data
            加载结果合并文件xxxx_xx-r.data,导入mysql数据库

        :param ctime:格式为'%Y-%m-%d'的ctime
        :param tp: 业务类型
        :return:
        """

        try:

            cdate = datetime.datetime.strptime(ctime, '%Y-%m-%d')
            end_date = cdate
            # the merged file is named <year>_<month>.data
            merged_file_name = '%s.data' % cdate.strftime('%Y_%m')

            result_file_name = '%s-r.data' % cdate.strftime('%Y_%m')
            # HDFS input root path
            hdfs_root_path = '%s/jjzhu/test/_%s/%s/' % (self.hdfs_root_url, str(tp.value), cdate.strftime('%Y_%m'))
            # local temporary directory
            local_temp_path = '%s/temp/_%s/' % (self.local_root_url, str(tp.value))

            def exist_or_create(path):
                if not os.path.exists(path):
                    self.logger.warning('local path: %s does not exist' % path)
                    self.logger.info('creating dir: %s' % path)
                    os.makedirs(path)
                else:
                    self.logger.info('local path: %s already exists' % path)

            exist_or_create(local_temp_path)

            # local output path for the merged file
            local_output_path = '%s/output/_%s/' % (self.local_root_url, str(tp.value))
            exist_or_create(local_output_path)

            local_result_path = '%s/result/_%s/' % (self.local_root_url, str(tp.value))
            exist_or_create(local_result_path)
            # HDFS output path for the AUM results
            hdfs_save_path = '%s/jjzhu/test/result/_%s/%s' % (self.hdfs_root_url,
                                                              str(tp.value), cdate.strftime('%Y_%m'))
            # HDFS input path for the AUM source file
            hdfs_input_path = '%s/jjzhu/test/input/_%s/' % (self.hdfs_root_url, str(tp.value))
            if os.system('hadoop fs -mkdir %s' % hdfs_input_path) == 0:
                self.logger.warning('%s did not exist, created it successfully' % hdfs_input_path)
            target_save_table = 't_CMMS_ASSLIB_ASSET_c'  # TODO
            save_mode = 'append'
            local_output_file_path = local_output_path + merged_file_name
            local_dir_for_merge = local_temp_path + cdate.strftime('%Y_%m')

            # convert a Row into a plain tuple
            def change_row(row):
                row_dict = row.asDict()
                return (row_dict['CUST_NO'], row_dict['ACCT_NO15'], str(row_dict['CURR_BAL']),
                        str(row_dict['TIME']), row_dict['CUR'],
                        row_dict['CURR_DATE'].strftime('%Y-%m-%d'),
                        str(row_dict['BUSI_TYPE']))

            def calc_avg(el):
                r = el[1]
                return (r[0], r[1], float(r[2]) / int(r[3]), r[4],
                        datetime.datetime.strptime(str(r[5]), '%Y-%m-%d').strftime('%Y-%m'), r[6])

            # copy each month's data from HDFS to the local filesystem
            def copy_from_hdfs_to_local(hdfs_dir, local_dir):
                import sys
                if not os.path.exists(local_dir):
                    self.logger.warning('%s does not exist, creating it first.' % local_dir)
                    os.system('mkdir ' + local_dir)
                    # exit(-1)
                shell_command = 'hadoop fs -copyToLocal %s %s' % (hdfs_dir, local_dir)
                self.logger.info('execute hdfs shell command: %s' % shell_command)
                os.system(shell_command)

            # merge the per-day part files into one local file
            def merge_file(input_dir, output_path):
                import platform
                if output_path.__contains__('/data'):
                    output_file = open(output_path, 'w+')
                    if platform.system() == 'Windows':
                        deli = '\\'
                    elif platform.system() == 'Linux':
                        deli = '/'
                    else:
                        self.logger.error('unknown platform: %s' % platform.system())
                    for i in os.listdir(input_dir):
                        curr_dir = input_dir + deli + i
                        for j in os.listdir(curr_dir):
                            if j.startswith('part'):  # only merge output files whose names start with 'part'
                                with open(curr_dir + deli + j) as f:
                                    # print(curr_dir+deli+j)
                                    for line in f.readlines():
                                        # print(line[1:len(line)-2])
                                        output_file.write(line[1:len(line) - 2] + '\n')

                    output_file.close()
                else:
                    self.logger.error('please make sure your input path is under /data')
                    exit(-1)

            while end_date.month == cdate.month:
                # print(end_date)
                hdfs_path = hdfs_root_path + end_date.strftime('%Y_%m_%d')
                # call the MySQL stored procedure to refresh the balance for the current date
                self.logger.info('call Calc_balance(%s, %s)' %
                                 (end_date.strftime('%Y-%m-%d'), str(tp.value)))
                self.mysqlconn.execute_single('call Calc_balance(%s, %s)',
                                              (end_date.strftime('%Y-%m-%d'), str(tp.value)))
                # fetch the current balances
                curr_bal_df = self.load_from_mysql('core', 't_CMMS_TEMP_DAYBAL_T').select(
                    'CUST_NO', 'ACCT_NO15', 'CURR_BAL', 'TIME', 'CUR', 'CURR_DATE', 'BUSI_TYPE'
                )

                curr_bal_rdd = curr_bal_df.filter(curr_bal_df.BUSI_TYPE == tp.value).map(change_row)
                print(curr_bal_rdd.take(1))
                if curr_bal_rdd.count() == 0:
                    self.logger.warning('rdd is empty')
                    end_date += datetime.timedelta(1)  # advance the date so the loop can terminate
                    continue
                if os.system('hadoop fs -test -e ' + hdfs_path) == 0:
                    self.logger.warning('%s already exists, deleting' % hdfs_path)
                    if os.system('hadoop fs -rm -r ' + hdfs_path) == 0:
                        self.logger.info('deleted %s successfully' % hdfs_path)
                # save the current balances to HDFS so they can be merged later
                self.logger.info('save rdd context to %s' % hdfs_path)
                curr_bal_rdd.saveAsTextFile(hdfs_path)
                end_date += datetime.timedelta(1)
            # copy the current month's data to the local filesystem for merging
            self.logger.info('copy file from %s to %s' % (hdfs_root_path, local_temp_path))
            copy_from_hdfs_to_local(hdfs_root_path, local_temp_path)
            self.logger.info('merge file content\n\tinput dir: %s\n\toutput_file: %s' %
                             (local_dir_for_merge, local_output_file_path))
            merge_file(local_dir_for_merge, local_output_file_path)
            # move the merged file back to HDFS as input for the AUM calculation
            if os.system('hadoop fs -test -e ' + hdfs_input_path) == 1:
                self.logger.warning('hdfs dir: %s does not exist' % hdfs_input_path)
                self.logger.info('\texecute hdfs command: hadoop fs -mkdir %s' % hdfs_input_path)
                os.system('hadoop fs -mkdir ' + hdfs_input_path)
            if os.system('hadoop fs -test -e ' + hdfs_input_path + merged_file_name) == 0:
                self.logger.info('hdfs file: %s already exists' % (hdfs_input_path + merged_file_name))
                self.logger.info('\texecute hdfs command: hadoop fs -rm ' + hdfs_input_path + merged_file_name)
                os.system('hadoop fs -rm ' + hdfs_input_path + merged_file_name)
            os.system('hadoop fs -put ' + local_output_file_path + ' ' + hdfs_input_path)

            # calculate AUM (average daily balance)
            self.logger.info('start calculate AUM of %s' % cdate.strftime('%Y-%m'))
            all_data = self.sc.textFile(hdfs_input_path + merged_file_name)

            day_bal = all_data.map(lambda line: line.strip().split(',')) \
                .map(lambda fields: (fields[0].strip()[1:len(fields[0].strip()) - 1],  # CUST_NO
                                     fields[1].strip()[1:len(fields[1].strip()) - 1],  # ACCT_NO15
                                     float(fields[2].strip()[1:len(fields[2].strip()) - 1]),  # CURR_BAL
                                     1,  # TIME
                                     fields[4].strip()[1:len(fields[4].strip()) - 1],  # CUR
                                     fields[5].strip()[1:len(fields[5].strip()) - 1],  # CURR_DATE
                                     fields[6].strip()[1:len(fields[6].strip()) - 1],  # BUSI_TYPE
                                     ))
            # key by ACCT_NO15, accumulate balance and day count, then compute the average daily balance
            add_bal = day_bal.map(lambda fields: (fields[1], fields)) \
                .reduceByKey(lambda a, b: (a[0], a[1],
                                           float(float(a[2]) + float(b[2])),
                                           int(a[3]) + int(b[3]),
                                           a[4], max(a[5], b[5]), a[6])) \
                .map(calc_avg)
            # if the save path already exists, delete it first; otherwise saveAsTextFile would fail
            if os.system('hadoop fs -test -e ' + hdfs_save_path) == 0:
                os.system('hadoop fs -rm -r ' + hdfs_save_path)
            # add_bal.cache()
            add_bal.saveAsTextFile(hdfs_save_path)

            os.system('hadoop fs -getmerge %s %s%s' % (hdfs_save_path, local_result_path, result_file_name))
            os.system('hadoop fs -put %s%s %s' % (local_result_path, result_file_name,
                                                  hdfs_input_path))

            result_rdd = self.sc.textFile(hdfs_input_path + result_file_name)
            mapped_rdd = result_rdd.map(lambda line: line[1: len(line) - 1]) \
                .map(lambda line: line.split(',')) \
                .map(lambda fields: (fields[0].strip()[1:len(fields[0].strip()) - 1],
                                     fields[1].strip()[1:len(fields[1].strip()) - 1],  # ACCT_NO15
                                     float(fields[2].strip()[0:len(fields[2].strip())]),  # AUM
                                     fields[3].strip()[1:len(fields[3].strip()) - 1],  # CUR
                                     fields[4].strip()[1:len(fields[4].strip()) - 1],  # STAT_DAT
                                     fields[5].strip()[1:len(fields[5].strip()) - 1],  # ASS_TYPE
                                     ))
            # recreate the corresponding DataFrame and its writer
            writer = DataFrameWriter(self.sql_context.createDataFrame(mapped_rdd, ['CUST_NO', 'ACC_NO15', 'AUM', 'CUR',
                                                                                   'STAT_DAT', 'ASS_TYPE']))
            # self.mysql_url = 'jdbc:mysql://10.9.29.212:3306/core?user=root&characterEncoding=UTF-8'
            # save the DataFrame contents into the target table with the given save mode
            writer.jdbc(self.mysql_url, table=target_save_table, mode=save_mode)
        except ValueError as e:
            self.logger.error(e)
            exit(-1)
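    # self.load_from_mysql() is used in both methods above but not shown. A
    # minimal sketch, assuming it reads a MySQL table through the JDBC data
    # source (the URL below mirrors the commented-out mysql_url format and is
    # a placeholder, not a value from the original project):
    def load_from_mysql(self, db, table):
        url = 'jdbc:mysql://localhost:3306/%s?user=root&characterEncoding=UTF-8' % db  # placeholder
        return self.sql_context.read.jdbc(url=url, table=table)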
Example #9
    def write(self, df, table, mode='append'):

        writer = DataFrameWriter(df)
        writer.jdbc(self.url_connect, table, mode, self.properties)
from pyspark import SparkContext
from pyspark.sql import SQLContext

# Spark context
sc = SparkContext()
#sc._jsc.hadoopConfiguration().set("fs.s3a.awsAccessKeyId", "XXXX")
#sc._jsc.hadoopConfiguration().set("fs.s3a.awsSecretAccessKey", "XXX")

# SparkSQL context
sqlContext = SQLContext(sc)

filename = "RC_2006-01.parquet"
destination = "s3a://reddit-comments-raw/2006/comments_2006_05.parquet"
data = sqlContext.read.parquet(destination)
print("done", data.count())

from pyspark.sql import DataFrameWriter
my_writer = DataFrameWriter(data)


md = "overwrite"
database_name = 'reddit'
hostname = 'ec2-54-214-117-182.us-west-2.compute.amazonaws.com'
url_connect = "jdbc:postgresql://{hostname}:5000/{db}".format(hostname=hostname, db=database_name)
properties = {"user": "******", "password": "webuiuser", "driver": "org.postgresql.Driver"}
table = "word_count"

my_writer.jdbc(url=url_connect, table=table, mode=md, properties=properties)
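
# Optional sanity check (a sketch, not part of the original script): read the
# table back over the same JDBC connection and print the row count.
written_back = sqlContext.read.jdbc(url=url_connect, table=table, properties=properties)
print("rows in", table, ":", written_back.count())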