def main():
    db_base_option  = get_mysql_option(sys.argv[1])
    db_target_option = get_mysql_option(sys.argv[2])

    no_color_option = False
    if len(sys.argv) > 3 and sys.argv[3] == '--no-color':
        no_color_option = True

    db_base  = MySQLHelper(**db_base_option)
    db_target = MySQLHelper(**db_target_option)

    if db_base_option['passwd']:
        db_base_option['passwd'] = '***'

    if db_target_option['passwd']:
        db_target_option['passwd'] = '***'

    print '基准数据库:', ', '.join(['{}={}'.format(k, v) for k, v in db_base_option.items()])
    print '目标数据库:', ', '.join(['{}={}'.format(k, v) for k, v in db_target_option.items()])

    db_base_schema  = get_mysql_schema(db_base)
    db_target_schema = get_mysql_schema(db_target)

    schema_diff = compare_schema(db_base_schema, db_target_schema)

    if schema_diff:
        print '\n目标数据库相对于基准数据库存在以下差异:'
        print_schema_diff(schema_diff, no_color_option)

    else:
        print COLOR_GREEN + '\n数据库结构完全一致' + COLOR_RESET
Ejemplo n.º 2
0
    def __stop_command(self):
        try:
            self.orig_master_host = getattr(self, "orig_master_host")
            self.orig_master_config = ConfigHelper(self.orig_master_host)
        except Exception as e:
            print("Failed to read configuration for original master: %s" % str(e))
            return False

        # Original master
        try:
            orig_master_ip = getattr(self, "orig_master_ip", self.orig_master_host)
            orig_master_mysql_port = getattr(self, "orig_master_port", None)
            orig_master_ssh_ip = getattr(self, "orig_master_ssh_ip", orig_master_ip)
            orig_master_ssh_port = getattr(self, "orig_master_ssh_port", None)
            orig_master_ssh_user = getattr(self, "orig_master_ssh_user", None)
            orig_master_mysql_user = self.__unescape_from_shell(getattr(self, "orig_master_user"))
            orig_master_mysql_pass = self.__unescape_from_shell(getattr(self, "orig_master_password"))
        except AttributeError as e:
            print("Failed to read one or more required original master parameter(s): %s" % str(e))
            return False

        # Setup MySQL connections
        mysql_orig_master = MySQLHelper(orig_master_ip, orig_master_mysql_port, orig_master_mysql_user,
                                        orig_master_mysql_pass)

        try:
            print("Connecting to mysql on the original master '%s'" % self.orig_master_host)
            if not mysql_orig_master.connect():
                return False

            if self.orig_master_config.get_manage_vip():
                vip_type = self.orig_master_config.get_vip_type()
                print("Removing the vip using the '%s' provider from the original master '%s'" %
                      (vip_type, self.orig_master_host))

                if not self.__remove_vip_from_host(vip_type, self.orig_master_host, orig_master_ssh_ip,
                                                   orig_master_ssh_user, orig_master_ssh_port,
                                                   self.FAILOVER_TYPE_ONLINE):
                    return False

            if self.orig_master_config.get_super_read_only() and mysql_orig_master.super_read_only_supported():
                print("Setting super_read_only to '1' on the original master '%s'" % self.orig_master_host)
                if not mysql_orig_master.set_super_read_only() or not mysql_orig_master.is_super_read_only():
                    return False
            else:
                print("Setting read_only to '1' on the original master '%s'" % self.orig_master_host)
                if not mysql_orig_master.set_read_only() or not mysql_orig_master.is_read_only():
                    return False

            if not self.__mysql_kill_threads(self.orig_master_host, mysql_orig_master):
                return False
        except Exception as e:
            print("Unexpected error: %s" % str(e))
            return False
        finally:
            print("Disconnecting from mysql on the original master '%s'" % self.orig_master_host)
            mysql_orig_master.disconnect()

        return True
Ejemplo n.º 3
0
    def __init__(self):
        self.conf = (SparkConf()
                     .setAppName("BandCard")
                     .set("spark.cores.max", "2")
                     .set('spark.executor.extraClassPath', '/usr/local/env/lib/mysql-connector-java-5.1.38-bin.jar'))
        self.sc = SparkContext(conf=self.conf)
        self.sqlctx = SQLContext(self.sc)

        self.mysql_helper = MySQLHelper('core', host='10.9.29.212')
Ejemplo n.º 4
0
    def __rollback_stop_command(self):
        try:
            self.orig_master_host = getattr(self, "orig_master_host")
            self.orig_master_config = ConfigHelper(self.orig_master_host)
        except Exception as e:
            print("Failed to read configuration for original master: %s" % str(e))
            return False

        # Original master
        try:
            orig_master_ip = getattr(self, "orig_master_ip", self.orig_master_host)
            orig_master_mysql_port = getattr(self, "orig_master_port", None)
            orig_master_mysql_user = self.__unescape_from_shell(getattr(self, "orig_master_user"))
            orig_master_mysql_pass = self.__unescape_from_shell(getattr(self, "orig_master_password"))
            orig_master_ssh_ip = getattr(self, "orig_master_ssh_ip", orig_master_ip)
            orig_master_ssh_port = getattr(self, "orig_master_ssh_port", None)
            orig_master_ssh_user = getattr(self, "orig_master_ssh_user", None)
            orig_master_ssh_options = getattr(self, "ssh_options", None)
        except AttributeError as e:
            print("Failed to read one or more required original master parameter(s): %s" % str(e))
            return False

        # Setup MySQL connections
        mysql_orig_master = MySQLHelper(orig_master_ip, orig_master_mysql_port, orig_master_mysql_user,
                                        orig_master_mysql_pass)

        print("Rolling back the failover changes on the original master '%s'" % self.orig_master_host)
        try:
            if not mysql_orig_master.connect():
                print("Failed to connect to mysql on the original master '%s'" % self.orig_master_host)
                return False

            if not mysql_orig_master.unset_read_only() or mysql_orig_master.is_read_only():
                print("Failed to reset read_only to '0' on the original master '%s'" % self.orig_master_host)
                return False

            print("Set read_only back to '0' on the original master '%s'" % self.orig_master_host)

            if self.orig_master_config.get_manage_vip():
                vip_type = self.orig_master_config.get_vip_type()
                if not self.__add_vip_to_host(vip_type, self.orig_master_host, orig_master_ssh_ip, orig_master_ssh_user,
                                              orig_master_ssh_port, orig_master_ssh_options):
                    print("Failed to add back the vip using the '%s' provider to the original master '%s'" %
                          (vip_type, self.orig_master_host))
                    return False

                print("Added back the vip to the original master '%s'" % self.orig_master_host)
        except Exception as e:
            print("Unexpected error: %s" % str(e))
            return False
        finally:
            mysql_orig_master.disconnect()

        return True
Ejemplo n.º 5
0
class DataHandler:
    def __init__(self):
        self.conf = (SparkConf()
                     .setAppName("BandCard")
                     .set("spark.cores.max", "2")
                     .set('spark.executor.extraClassPath', '/usr/local/env/lib/mysql-connector-java-5.1.38-bin.jar'))
        self.sc = SparkContext(conf=self.conf)
        self.sqlctx = SQLContext(self.sc)

        self.mysql_helper = MySQLHelper('core', host='10.9.29.212')

    def load_from_mysql(self, table, database='core'):
        url = "jdbc:mysql://10.9.29.212:3306/%s?user=root&characterEncoding=UTF-8" % database
        df = self.sqlctx.read.format("jdbc").options(url=url, dbtable=table, driver="com.mysql.jdbc.Driver").load()
        return df




    def prepare_life_cycle(self, year, season):
        '''
        准备生命周期数据
        从t_CMMS_ASSLIB_ASSET中获取每日AUM数据
        prepare data

        saum1 (last season sum aum)
        saum2 (current season sum aum)
        aum_now
        account_age (months)
        last_tr_date (days)

        :param year:
        :param season: 1,2,3,4
        :return:
        '''

        # 计算月份
        print('----------------------生命周期-Start----------------------')

        print('开始准备生命周期数据...')
        print('开始计算月份')

        if season == 1:
            # date1 当前季度月份
            date1 = [str(year) + '-01', str(year) + '-02', str(year) + '-03']

            # date2 上一季月份
            date2 = [str(year - 1) + '-10', str(year - 1) + '-11', str(year - 1) + '-12']

        elif season == 4:
            date1 = [str(year) + '-10', str(year) + '-11', str(year) + '-12']
            date2 = [str(year) + '-07', str(year) + '-08', str(year) + '-9']

        else:
            date1 = [str(year) + '-0' + str(3 * season - 2), str(year) + '-0' + str(3 * season - 1),
                     str(year) + '-0' + str(3 * season)]
            date2 = [str(year) + '-0' + str(3 * season - 5), str(year) + '-0' + str(3 * season - 4),
                     str(year) + '-0' + str(3 * season - 3)]

        print('当前季度月份 new:', date1)
        print('上一季度月份 old:', date2)

        # 加载AUM表
        aum = self.load_from_mysql('t_CMMS_ASSLIB_ASSET_c').cache()

        # 拼接每季度三个月断数据
        season_new = aum.filter(aum.STAT_DAT == date1[0]).unionAll(aum.filter(aum.STAT_DAT == date1[1])).unionAll(
            aum.filter(aum.STAT_DAT == date1[2]))
        season_old = aum.filter(aum.STAT_DAT == date2[0]).unionAll(aum.filter(aum.STAT_DAT == date2[1])).unionAll(
            aum.filter(aum.STAT_DAT == date2[2]))

        # 计算每季度AUM
        aum_season_old = season_old.select('CUST_NO', season_old.AUM.alias('AUM1')).groupBy('CUST_NO').sum('AUM1')
        aum_season_new = season_new.select('CUST_NO', season_new.AUM.alias('AUM2')).groupBy('CUST_NO').sum('AUM2')

        # 两个季度进行外联接
        '''
        +-----------+---------+---------+
        |    CUST_NO|sum(AUM2)|sum(AUM1)|
        +-----------+---------+---------+
        |81005329523|     null|294844.59|
        |81011793167|     null|   365.20|
        |81015319088|     null|  9640.96|
        +-----------+---------+---------+
        '''
        union_season = aum_season_old.join(aum_season_new, 'CUST_NO', 'outer')

        # 筛选当前AUM
        temp_result = aum.select('CUST_NO', 'AUM', 'STAT_DAT').groupBy('CUST_NO', 'STAT_DAT').sum('AUM').sort(
            'CUST_NO').sort(aum.STAT_DAT.desc())
        temp_result.select('CUST_NO', temp_result['sum(AUM)'].alias('AUM'), 'STAT_DAT').registerTempTable('group_in')

        aum_now_sql = "select CUST_NO,first(AUM) as AUM_NOW from group_in group by CUST_NO"

        aum_now = self.sqlctx.sql(aum_now_sql)
        # 清除缓存表
        self.sqlctx.dropTempTable('group_in')

        # 联合
        union_season_aumnow = union_season.join(aum_now, 'CUST_NO', 'outer')

        # 计算用户开户至今时间(months)
        # 载入账户表
        account = self.load_from_mysql('t_CMMS_ACCOUNT_LIST').cache()
        account.select('CUST_NO', 'OPEN_DAT').registerTempTable('account')
        account_age_aql = "select  CUST_NO, first(ACCOUNT_AGE) as ACCOUNT_AGE  from " \
                          "(select CUST_NO, round(datediff(now(), OPEN_DAT) / 30) as ACCOUNT_AGE " \
                          "from account order by CUST_NO, ACCOUNT_AGE desc ) as t group by CUST_NO"

        account_age = self.sqlctx.sql(account_age_aql)

        # calculate last tran date
        account_1 = account.select('CUST_NO', 'ACC_NO15')
        detail = self.load_from_mysql('t_CMMS_ACCOUNT_DETAIL').select('ACC_NO15', 'TRAN_DAT')
        a_d = account_1.join(detail, 'ACC_NO15', 'outer')
        a_d.filter(a_d.CUST_NO != '').registerTempTable('adtable')

        last_tr_date_sql = "select CUST_NO,first(TRAN_DAT) as LAST_TR_DATE from (select CUST_NO,TRAN_DAT from adtable order by TRAN_DAT desc) as t group by CUST_NO"

        last_tr_date = self.sqlctx.sql(last_tr_date_sql)

        # 联合 season   aum_now    account_age     last_tr_date
        unions = union_season_aumnow.join(account_age, 'CUST_NO', 'outer').join(last_tr_date, 'CUST_NO', 'outer')

        # 清除缓存表
        self.sqlctx.dropTempTable('account')
        self.sqlctx.dropTempTable('adtable')
        self.sqlctx.clearCache()

        # 结果插入表
        print('结果插入临时表:t_CMMS_TEMP_LIFECYCLE...')
        insert_lifecycle_sql = "replace into t_CMMS_TEMP_LIFECYCLE(CUST_NO,SAUM1,SAUM2,INCREASE,ACCOUNT_AGE,AUM_NOW,LAST_TR_DATE) values(%s,%s,%s,%s,%s,%s,%s)"

        # 缓冲区
        temp = []
        for row in unions.collect():
            row_dic = row.asDict()

            if len(temp) >= 1000:  # 批量写入数据库
                self.mysql_helper.executemany(insert_lifecycle_sql, temp)
                temp.clear()

            # 加载数据到缓冲区

            try:
                # 计算增长率
                increase = (row_dic['sum(AUM2)'] - row_dic['sum(AUM1)']) / row_dic['sum(AUM1)']
            except Exception:
                increase = 0

            # 计算开户时长(月份数) 若无则视为6个月以上
            if row_dic['ACCOUNT_AGE'] is None:
                row_dic['ACCOUNT_AGE'] = 7

            # 最后交易日期
            ltd = row_dic['LAST_TR_DATE']
            if ltd is not None:
                try:
                    ltd = datetime.datetime.strptime(ltd, '%Y-%m-%d')
                except Exception:
                    ltd = ltd[:4] + '-' + ltd[4:6] + '-' + ltd[6:]
                    ltd = datetime.datetime.strptime(ltd, '%Y-%m-%d')

                days = (datetime.datetime.now() - ltd).days
            else:
                days = 366

            temp.append((row_dic['CUST_NO'], row_dic['sum(AUM1)'], row_dic['sum(AUM2)'], increase,
                         row_dic['ACCOUNT_AGE'], row_dic['AUM_NOW'], days))

        if len(temp) != 0:
            self.mysql_helper.executemany(insert_lifecycle_sql, temp)
            temp.clear()








    def calculate_life_cycle(self):

        '''
        根据AUM变化情况计算生命周期阶段
        calculate life cycle period
        :return:
        '''

        print('开始计算生命周期...')
        life_cycle = self.load_from_mysql('t_CMMS_TEMP_LIFECYCLE').cache()

        def clcmap(line):
            cust_no = line['CUST_NO']
            account_age = line['ACCOUNT_AGE']
            last_tr_date = line['LAST_TR_DATE']
            aum_now = line['AUM_NOW']
            increase = line['INCREASE']

            period = 0
            if aum_now is None:
                period = 9  # 未知
            elif aum_now < 1000 and last_tr_date > 365:
                period = 3  # 流失期
            else:
                if increase > 20 or account_age < 6:
                    period = 0  # 成长期
                elif increase >= -20 and increase <= 20:
                    period = 1  # 成熟期
                else:
                    period = 2  # 稳定期

            return period, cust_no


        map_result = life_cycle.map(clcmap).collect()

        # clear the life_cycle cache
        self.sqlctx.clearCache()

        temp = []
        print('结果更新到临时表:t_CMMS_TEMP_LIFECYCLE...')
        update_life_period_sql = "update t_CMMS_TEMP_LIFECYCLE set PERIOD = %s where CUST_NO = %s"
        for row in map_result:

            if len(temp) >= 1000:
                self.mysql_helper.executemany(update_life_period_sql, temp)
                temp.clear()
            temp.append(row)

        if len(temp) != 1000:
            self.mysql_helper.executemany(update_life_period_sql, temp)
            temp.clear()







    def lifecycle_to_real_table(self, year, season):
        '''
        将生命周期数据写入正式表中
        put life_cycle tmp table to real table
        :return:
        '''

        print('开始将生命周期数据写入正式表中...')
        life_cycle = self.load_from_mysql('t_CMMS_TEMP_LIFECYCLE').select('CUST_NO', 'PERIOD')
        cust_info = self.load_from_mysql('t_CMMS_INFO_CUSTOMER').select('CUST_NO', 'CUST_ID', 'CUST_NAM')

        union = life_cycle.join(cust_info, 'CUST_NO', 'left_outer').cache()

        temp = []
        sql = "replace into t_CMMS_ANALYSE_LIFE(CUST_NO,CUST_ID,CUST_NM,LIFE_CYC,QUARTER,UPDATE_TIME) values(%s,%s,%s,%s,%s,now())"
        quarter = str(year) + '-' + str(season)
        for row in union.collect():

            if len(temp) >= 1000:
                self.mysql_helper.executemany(sql, temp)
                temp.clear()

            cust_id = row['CUST_ID'] if row['CUST_ID'] is not None else '0'
            temp.append((row['CUST_NO'], cust_id, row['CUST_NAM'], row['PERIOD'], quarter))

        if len(temp) != 1000:
            self.mysql_helper.executemany(sql, temp)
            temp.clear()

        self.sqlctx.clearCache()





    def run_life_cycle(self,year,season):
        '''
        运行完整的生命周期流程
        1 准备生命周期数据,计算AUM及其变化幅度
        2 根据变化幅度计算生命周期阶段
        3 将数据从缓存表放到实际表
        :param year:
        :param season:
        :return:
        '''
        self.prepare_life_cycle(year,season)
        self.calculate_life_cycle()
        self.lifecycle_to_real_table(year,season)




#------------------------------------------------------------------------生命周期结束------------------------------------------------------------------------#














    def customer_value(self, year, half_year):
        '''
        计算客户价值
        calculate customer value
        :param year: which year to calculate
        :param half_year: 0 for month 1-6 , 1 for month 7-12
        :return:
        '''

        print('---------------------------客户价值-Start--------------------------')
        cust_info = self.load_from_mysql('t_CMMS_INFO_CUSTOMER').select('CUST_NO', 'CUST_ID', 'CUST_NAM').cache()
        aum = self.load_from_mysql('t_CMMS_ASSLIB_ASSET_c').select('CUST_NO', 'STAT_DAT', 'AUM', 'ASS_TYPE').cache()

        base = half_year * 6

        aum_slot_filter = None

        for i in range(1, 7):
            i = base + i
            if i < 10:
                i = '0' + str(i)
            else:
                i = str(i)
            slot = str(year) + '-' + i

            slot_filter = aum.filter(aum.STAT_DAT == slot)
            if aum_slot_filter is None:
                aum_slot_filter = slot_filter
            else:
                aum_slot_filter = aum_slot_filter.unionAll(slot_filter)

        # CUST_NO sum(AUM)
        huoqi_aum = aum_slot_filter.select('CUST_NO', 'ASS_TYPE', aum_slot_filter['AUM'].alias('AUM_HQ')).filter(
            aum_slot_filter.ASS_TYPE == '1').groupBy('CUST_NO').sum('AUM_HQ')
        dingqi_aum = aum_slot_filter.select('CUST_NO', 'ASS_TYPE', (aum_slot_filter.AUM * 0.8).alias('AUM_DQ')).filter(
            aum_slot_filter.ASS_TYPE == '2').groupBy('CUST_NO').sum('AUM_DQ')

        # 定期活期已计算好,sum(AUM_HQ),sum(AUM_DQ)
        j = huoqi_aum.join(dingqi_aum, 'CUST_NO', 'outer')
        # j.show()


        # 清除原有数据
        self.mysql_helper.execute('truncate core.t_CMMS_ANALYSE_VALUE')

        # 开始联合其他表
        all_col = j.join(cust_info, 'CUST_NO', 'outer')

        print(j.count(), cust_info.count())

        # all_col.show()

        #根据客户价值计算客户等级
        def calculate_rank(value):
            if value < 1000:
                return 0
            elif value < 10000:
                return 1
            elif value < 100000:
                return 2
            elif value < 500000:
                return 3
            elif value < 2000000:
                return 4
            elif value < 5000000:
                return 5
            else:
                return 6

        temp = []
        print('将数据replace到正式表...')
        update_value_sql = "replace into t_CMMS_ANALYSE_VALUE(CUST_ID,CUST_NO,CUST_NM,CUST_VALUE,CUST_RANK,SLOT,UPDATE_TIME) values(%s,%s,%s,%s,%s,%s,now())"
        for row in all_col.collect():

            if len(temp) >= 1000:
                self.mysql_helper.executemany(update_value_sql, temp)
                temp.clear()

            val_dq = row['sum(AUM_DQ)'] if row['sum(AUM_DQ)'] is not None else 0
            val_hq = row['sum(AUM_HQ)'] if row['sum(AUM_HQ)'] is not None else 0

            cust_val = float(val_dq) + float(val_hq)

            cust_rank = calculate_rank(cust_val)

            slot = str(year) + '-' + str(half_year)
            cust_id = row['CUST_ID'] if row['CUST_ID'] is not None else 1
            temp.append((cust_id, row['CUST_NO'], row['CUST_NAM'], cust_val, cust_rank, slot))

        if len(temp) != 1000:
            self.mysql_helper.executemany(update_value_sql, temp)
            temp.clear()





    def aum_total(self):
        '''
        计算AUM总和
        data for t_CMMS_ASSLIB_ASSTOT
        :return:
        '''
        print('---------------------------总资产-Start--------------------------')
        # TODO t_CMMS_ASSLIB_ASSET_c 要改成正式表t_CMMS_ASSLIB_ASSET
        df_asset = self.load_from_mysql('t_CMMS_ASSLIB_ASSET_c').select('CUST_NO', 'CUST_ID', 'STAT_DAT', 'AUM', 'CUR',
                                                                      'ACC_NAM').cache()
        # print(df_asset.count(), df_asset.columns)

        other_col = df_asset.select('CUST_NO', 'CUST_ID', 'CUR', 'ACC_NAM').distinct()
        # print(other_col.count(),other_col.columns)

        aum = df_asset.select('CUST_NO', 'STAT_DAT', 'AUM')
        # print(aum.count(), aum.columns)

        aum = aum.select('CUST_NO', 'STAT_DAT', 'AUM').groupBy(['CUST_NO', 'STAT_DAT']).sum('AUM').sort(
            ['CUST_NO', aum.STAT_DAT.desc()]) \
            .groupBy('CUST_NO').agg({'sum(AUM)': 'first', 'STAT_DAT': 'first'})
        # print(aum.count(), aum.columns)


        total = aum.select('CUST_NO', aum['first(sum(AUM))'].alias('AUM'), aum['first(STAT_DAT)'].alias('STAT_DAT')). \
            join(other_col, 'CUST_NO', 'left_outer').distinct()

        # total.filter(total.STAT_DAT == '2016-06-') .show()

        # prepare params
        def list_map(line):
            return line['CUST_ID'], line['CUST_NO'], line['ACC_NAM'], line['STAT_DAT'], line['CUR'], line['AUM']

        df = total.map(list_map)

        # clear old data
        self.mysql_helper.execute('truncate t_CMMS_ASSLIB_ASSTOT')
        sql = "insert into t_CMMS_ASSLIB_ASSTOT(CUST_ID,CUST_NO,ACC_NAM,STAT_DAT,CUR,AUM) values(%s,%s,%s,%s,%s,%s)"

        # execute sql
        self.mysql_helper.batch_operate(sql, df, 100)

    def debt_total(self):
        '''
        prepare data for total debt
        :return:
        '''

        print('---------------------------总负债-Start--------------------------')
        df_debt = self.load_from_mysql('t_CMMS_ASSLIB_DEBT').select('LOAN_ACC', 'CUST_NO', 'CUST_ID', 'CUST_NAM',
                                                                    'BAL_AMT', 'GRANT_AMT', 'CUR')
        df_debt = df_debt.filter(df_debt.LOAN_ACC != '')

        df_sum = df_debt.groupBy('CUST_NO').sum('GRANT_AMT', 'BAL_AMT')
        df_other = df_debt.groupBy('CUST_NO').agg({'CUST_ID': 'first', 'CUST_NAM': 'first', 'CUR': 'first'})

        df_total = df_sum.join(df_other, 'CUST_NO', 'left_outer').distinct()

        stat_dat = datetime.datetime.now().strftime('%Y%m%d')

        def m(line):
            return line['CUST_NO'], line['first(CUST_ID)'], line['first(CUST_NAM)'], line['first(CUR)'], line[
                'sum(GRANT_AMT)'], line['sum(BAL_AMT)'], stat_dat

        df = df_total.map(m)

        sql = "replace into t_CMMS_ASSLIB_DEBTOT(CUST_NO,CUST_ID,ACC_NAM,CUR,LOAN_AMT,BAL_AMT,STAT_DAT) values(%s,%s,%s,%s,%s,%s,%s)"

        self.mysql_helper.batch_operate(sql, df)




    def  run(self):
        # 生命周期 年份 季度1,2,3,4
        dh.run_life_cycle(2016, 2)

        # 客户价值 上半年:0,下半年:1
        dh.customer_value(2016, 0)

        # 总资产
        dh.aum_total()

        # 总负债
        dh.debt_total()
Ejemplo n.º 6
0
    def __start_command(self):
        try:
            self.orig_master_host = getattr(self, "orig_master_host")
            self.orig_master_config = ConfigHelper(self.orig_master_host)
        except Exception as e:
            print("Failed to read configuration for original master: %s" % str(e))
            return False

        try:
            self.new_master_host = getattr(self, "new_master_host")
            self.new_master_config = ConfigHelper(self.new_master_host)
        except Exception as e:
            print("Failed to read configuration for new master: %s" % str(e))
            return False

        # New master
        try:
            new_master_ip = getattr(self, "new_master_ip", self.new_master_host)
            new_master_mysql_port = getattr(self, "new_master_port", None)
            new_master_mysql_user = self.__unescape_from_shell(getattr(self, "new_master_user"))
            new_master_mysql_pass = self.__unescape_from_shell(getattr(self, "new_master_password"))
            new_master_ssh_ip = getattr(self, "new_master_ssh_ip", new_master_ip)
            new_master_ssh_port = getattr(self, "new_master_ssh_port", None)
            new_master_ssh_options = getattr(self, "ssh_options", None)

            if self.failover_type == self.FAILOVER_TYPE_HARD:
                new_master_ssh_user = getattr(self, "ssh_user", None)
            else:
                new_master_ssh_user = getattr(self, "new_master_ssh_user", None)
        except AttributeError as e:
            print("Failed to read one or more required new master parameter(s): %s" % str(e))
            return False

        # Setup MySQL connection
        mysql_new_master = MySQLHelper(new_master_ip, new_master_mysql_port, new_master_mysql_user,
                                       new_master_mysql_pass)

        try:
            print("Connecting to mysql on the new master '%s'" % self.new_master_host)
            if not mysql_new_master.connect():
                return False

            print("Setting read_only to '0' on the new master '%s'" % self.new_master_host)
            if not mysql_new_master.unset_read_only() or mysql_new_master.is_read_only():
                return False

            if self.new_master_config.get_manage_vip():
                vip_type = self.new_master_config.get_vip_type()
                print("Adding the vip using the '%s' provider to the new master '%s'" %
                      (vip_type, self.new_master_host))

                if not self.__add_vip_to_host(vip_type, self.new_master_host, new_master_ssh_ip, new_master_ssh_user,
                                              new_master_ssh_port, new_master_ssh_options):
                    return False
        except Exception as e:
            print("Unexpected error: %s" % str(e))
            return False
        finally:
            print("Disconnecting from mysql on the new master '%s'" % self.new_master_host)
            mysql_new_master.disconnect()

        return True
Ejemplo n.º 7
0
    def __stop_command(self):
        try:
            self.orig_master_host = getattr(self, "orig_master_host")
            self.orig_master_config = ConfigHelper(self.orig_master_host)
        except Exception as e:
            print("Failed to read configuration for original master: %s" %
                  str(e))
            return False

        # Original master
        try:
            orig_master_ip = getattr(self, "orig_master_ip",
                                     self.orig_master_host)
            orig_master_mysql_port = getattr(self, "orig_master_port", None)
            orig_master_ssh_ip = getattr(self, "orig_master_ssh_ip",
                                         orig_master_ip)
            orig_master_ssh_port = getattr(self, "orig_master_ssh_port", None)
            orig_master_ssh_user = getattr(self, "orig_master_ssh_user", None)
            orig_master_mysql_user = self.__unescape_from_shell(
                getattr(self, "orig_master_user"))
            orig_master_mysql_pass = self.__unescape_from_shell(
                getattr(self, "orig_master_password"))
        except AttributeError as e:
            print(
                "Failed to read one or more required original master parameter(s): %s"
                % str(e))
            return False

        # Setup MySQL connections
        mysql_orig_master = MySQLHelper(orig_master_ip, orig_master_mysql_port,
                                        orig_master_mysql_user,
                                        orig_master_mysql_pass)

        try:
            print("Connecting to mysql on the original master '%s'" %
                  self.orig_master_host)
            if not mysql_orig_master.connect():
                return False

            if self.orig_master_config.get_manage_vip():
                vip_type = self.orig_master_config.get_vip_type()
                print(
                    "Removing the vip using the '%s' provider from the original master '%s'"
                    % (vip_type, self.orig_master_host))

                if not self.__remove_vip_from_host(
                        vip_type, self.orig_master_host, orig_master_ssh_ip,
                        orig_master_ssh_user, orig_master_ssh_port,
                        self.FAILOVER_TYPE_ONLINE):
                    return False

            if self.orig_master_config.get_super_read_only(
            ) and mysql_orig_master.super_read_only_supported():
                print(
                    "Setting super_read_only to '1' on the original master '%s'"
                    % self.orig_master_host)
                if not mysql_orig_master.set_super_read_only(
                ) or not mysql_orig_master.is_super_read_only():
                    return False
            else:
                print("Setting read_only to '1' on the original master '%s'" %
                      self.orig_master_host)
                if not mysql_orig_master.set_read_only(
                ) or not mysql_orig_master.is_read_only():
                    return False

            if not self.__mysql_kill_threads(self.orig_master_host,
                                             mysql_orig_master):
                return False
        except Exception as e:
            print("Unexpected error: %s" % str(e))
            return False
        finally:
            print("Disconnecting from mysql on the original master '%s'" %
                  self.orig_master_host)
            mysql_orig_master.disconnect()

        return True
Ejemplo n.º 8
0
    def __rollback_stop_command(self):
        try:
            self.orig_master_host = getattr(self, "orig_master_host")
            self.orig_master_config = ConfigHelper(self.orig_master_host)
        except Exception as e:
            print("Failed to read configuration for original master: %s" %
                  str(e))
            return False

        # Original master
        try:
            orig_master_ip = getattr(self, "orig_master_ip",
                                     self.orig_master_host)
            orig_master_mysql_port = getattr(self, "orig_master_port", None)
            orig_master_mysql_user = self.__unescape_from_shell(
                getattr(self, "orig_master_user"))
            orig_master_mysql_pass = self.__unescape_from_shell(
                getattr(self, "orig_master_password"))
            orig_master_ssh_ip = getattr(self, "orig_master_ssh_ip",
                                         orig_master_ip)
            orig_master_ssh_port = getattr(self, "orig_master_ssh_port", None)
            orig_master_ssh_user = getattr(self, "orig_master_ssh_user", None)
        except AttributeError as e:
            print(
                "Failed to read one or more required original master parameter(s): %s"
                % str(e))
            return False

        # Setup MySQL connections
        mysql_orig_master = MySQLHelper(orig_master_ip, orig_master_mysql_port,
                                        orig_master_mysql_user,
                                        orig_master_mysql_pass)

        print("Rolling back the failover changes on the original master '%s'" %
              self.orig_master_host)
        try:
            if not mysql_orig_master.connect():
                print(
                    "Failed to connect to mysql on the original master '%s'" %
                    self.orig_master_host)
                return False

            if not mysql_orig_master.unset_read_only(
            ) or mysql_orig_master.is_read_only():
                print(
                    "Failed to reset read_only to '0' on the original master '%s'"
                    % self.orig_master_host)
                return False

            print("Set read_only back to '0' on the original master '%s'" %
                  self.orig_master_host)

            if self.orig_master_config.get_manage_vip():
                vip_type = self.orig_master_config.get_vip_type()
                if not self.__add_vip_to_host(
                        vip_type, self.orig_master_host, orig_master_ssh_ip,
                        orig_master_ssh_user, orig_master_ssh_port):
                    print(
                        "Failed to add back the vip using the '%s' provider to the original master '%s'"
                        % (vip_type, self.orig_master_host))
                    return False

                print("Added back the vip to the original master '%s'" %
                      self.orig_master_host)
        except Exception as e:
            print("Unexpected error: %s" % str(e))
            return False
        finally:
            mysql_orig_master.disconnect()

        return True
Ejemplo n.º 9
0
    def __start_command(self):
        try:
            self.orig_master_host = getattr(self, "orig_master_host")
            self.orig_master_config = ConfigHelper(self.orig_master_host)
        except Exception as e:
            print("Failed to read configuration for original master: %s" %
                  str(e))
            return False

        try:
            self.new_master_host = getattr(self, "new_master_host")
            self.new_master_config = ConfigHelper(self.new_master_host)
        except Exception as e:
            print("Failed to read configuration for new master: %s" % str(e))
            return False

        # New master
        try:
            new_master_ip = getattr(self, "new_master_ip",
                                    self.new_master_host)
            new_master_mysql_port = getattr(self, "new_master_port", None)
            new_master_mysql_user = self.__unescape_from_shell(
                getattr(self, "new_master_user"))
            new_master_mysql_pass = self.__unescape_from_shell(
                getattr(self, "new_master_password"))
            new_master_ssh_ip = getattr(self, "new_master_ssh_ip",
                                        new_master_ip)
            new_master_ssh_port = getattr(self, "new_master_ssh_port", None)

            if self.failover_type == self.FAILOVER_TYPE_HARD:
                new_master_ssh_user = getattr(self, "ssh_user", None)
            else:
                new_master_ssh_user = getattr(self, "new_master_ssh_user",
                                              None)
        except AttributeError as e:
            print(
                "Failed to read one or more required new master parameter(s): %s"
                % str(e))
            return False

        # Setup MySQL connection
        mysql_new_master = MySQLHelper(new_master_ip, new_master_mysql_port,
                                       new_master_mysql_user,
                                       new_master_mysql_pass)

        try:
            print("Connecting to mysql on the new master '%s'" %
                  self.new_master_host)
            if not mysql_new_master.connect():
                return False

            print("Setting read_only to '0' on the new master '%s'" %
                  self.new_master_host)
            if not mysql_new_master.unset_read_only(
            ) or mysql_new_master.is_read_only():
                return False

            if self.new_master_config.get_manage_vip():
                vip_type = self.new_master_config.get_vip_type()
                print(
                    "Adding the vip using the '%s' provider to the new master '%s'"
                    % (vip_type, self.new_master_host))

                if not self.__add_vip_to_host(
                        vip_type, self.new_master_host, new_master_ssh_ip,
                        new_master_ssh_user, new_master_ssh_port):
                    return False
        except Exception as e:
            print("Unexpected error: %s" % str(e))
            return False
        finally:
            print("Disconnecting from mysql on the new master '%s'" %
                  self.new_master_host)
            mysql_new_master.disconnect()

        return True
Ejemplo n.º 10
0
class KMData:
    def __init__(self):
        self.conf = (SparkConf()
                     .setAppName("KMeans")
                     .set("spark.cores.max", "2")
                     .set('spark.executor.extraClassPath', '/usr/local/env/lib/mysql-connector-java-5.1.38-bin.jar'))
        self.sc = SparkContext(conf=self.conf)
        self.sqlctx = SQLContext(self.sc)
        self.mysql_helper = MySQLHelper('core', host='10.9.29.212')
        self.base = 'hdfs://master:9000/gmc/'

    def load_from_mysql(self, table, database='core'):
        url = "jdbc:mysql://10.9.29.212:3306/%s?user=root&characterEncoding=UTF-8" % database
        df = self.sqlctx.read.format("jdbc").options(url=url, dbtable=table, driver="com.mysql.jdbc.Driver").load()
        return df

    def rfm(self):
        life_cycle = self.load_from_mysql('t_CMMS_ANALYSE_LIFE').select('CUST_NO', 'LIFE_CYC')
        value = self.load_from_mysql('t_CMMS_ANALYSE_VALUE').select('CUST_NO', 'CUST_VALUE', 'CUST_RANK')
        loyalty = self.load_from_mysql('t_CMMS_ANALYSE_LOYALTY').select('CUST_NO', 'LOYALTY')
        rfm = loyalty.join(value, 'CUST_NO', 'left_outer').join(life_cycle, 'CUST_NO', 'left_outer').distinct() \
            .map(lambda x: (x['CUST_NO'], x['LIFE_CYC'], x['CUST_VALUE'], x['CUST_RANK'], x['LOYALTY']))

        temp = []
        sql = "replace into t_CMMS_TEMP_KMEANS_COLUMNS(CUST_NO,LIFE_CYC,CUST_VALUE,CUST_RANK,LOYALTY) values(%s,%s,%s,%s,%s)"
        for row in rfm.collect():
            if len(temp) >= 1000:
                self.mysql_helper.executemany(sql, temp)
                temp.clear()
            temp.append(row)

            if len(temp) != 1000:
                self.mysql_helper.executemany(sql, temp)
                temp.clear()

    def idcard(self):
        '''
        get sex age and local from id card number
        :return:
        '''

        kcolums = self.load_from_mysql('t_CMMS_TEMP_KMEANS_COLUMNS').select('CUST_NO')
        cust_info = self.load_from_mysql('t_CMMS_INFO_CUSTOMER')

        j = kcolums.join(cust_info, 'CUST_NO', 'left_outer').select('CUST_NO', 'CUST_ID')

        year_now = datetime.datetime.now().year

        def split_map(line):

            no = line['CUST_NO']
            id = line['CUST_ID']

            if id is not None:
                sex_flag = 2
                year = 0

                if len(id) == 21:
                    year = id[9:13]
                    sex_flag = id[-2:-1]
                elif len(line) == 18:
                    year = '19' + id[9:11]
                    sex_flag = id[-1]

                # 1 man   2 woman
                sex = int(sex_flag) % 2
                age = year_now - int(year)
                if age > 100:
                    age = 0
                local = id[3:9]

                return age, sex, local, no

            else:
                return 0, 2, '000000', no

        asl = j.map(split_map).collect()

        temp = []
        sql = "update t_CMMS_TEMP_KMEANS_COLUMNS set AGE = %s, SEX = %s, LOCAL = %s where CUST_NO = %s"
        for row in asl:
            if len(temp) >= 1000:
                self.mysql_helper.executemany(sql, temp)
                temp.clear()
            temp.append(row)

            if len(temp) != 1000:
                self.mysql_helper.executemany(sql, temp)
                temp.clear()

    def aum(self):
        kcolums = self.load_from_mysql('t_CMMS_TEMP_KMEANS_COLUMNS').select('CUST_NO')
        aum = self.load_from_mysql('t_CMMS_ASSLIB_ASSTOT').select('CUST_NO', 'AUM')
        j = kcolums.join(aum, 'CUST_NO', 'left_outer').distinct()

        def m(line):
            aum = line['AUM'] if line['AUM'] is not None else 0
            return aum, line['CUST_NO']

        df = j.map(m)
        sql = "update t_CMMS_TEMP_KMEANS_COLUMNS set AUM = %s where CUST_NO = %s"
        self.sql_operate(sql, df)

    def sql_operate(self, sql, df, once_size=1000):
        '''
        批量数据库操作
        :param sql:
        :param df:
        :param once_size:
        :return:
        '''
        temp = []
        for row in df.collect():
            if len(temp) >= once_size:
                self.mysql_helper.executemany(sql, temp)
                temp.clear()
            temp.append(row)

        if len(temp) != 0:
            self.mysql_helper.executemany(sql, temp)
            temp.clear()

    def cust_info(self):
        self.rfm()
        self.idcard()
        self.aum()

    '''
        ************************************************************
    '''



    #TODO 时间跨度未设置
    def credit_card(self):

        #清空缓存表,导入新的额度数据
        init_credit_sql = "replace into t_CMMS_TEMP_KMEANS_CREDIT(ACCTNBR,CREDIT) (select XACCOUNT,CRED_LIMIT from core.ACCT)"
        self.mysql_helper.execute('truncate t_CMMS_TEMP_KMEANS_CREDIT')
        self.mysql_helper.execute(init_credit_sql)



        #信用卡交易记录
        credit_df = self.load_from_mysql('t_CMMS_CREDIT_TRAN')
        credit_df = credit_df.filter(credit_df['BILL_AMTFLAG'] == '+').cache()



        #计算循环(此处为刷卡金额占额度的比例,不是真正的循环信用)
        # join = credit_df.groupBy('ACCTNBR').sum('BILL_AMT').join(credit_limit_df,'ACCTNBR','outer')
        #
        # cycle = join.select('ACCTNBR',join['sum(BILL_AMT)'].alias('CYCLE_AMT'),'CREDIT')

        #刷卡金额与次数
        swipe_amt = credit_df.groupBy('ACCTNBR').sum('BILL_AMT')
        swipe_times = credit_df.groupBy('ACCTNBR').count()

        # swipe_amt.show()
        # swipe_times.show()

        swipe_result = swipe_amt.join(swipe_times,'ACCTNBR','outer')

        swipe_result = swipe_result.select('ACCTNBR',swipe_result['sum(BILL_AMT)'].alias('SWIPE_AMT'),swipe_result['count'].alias('SWIPE_TIMES'))






        #取现金额与次数
        codes = [2010,2050,2060,2182,2184]
        cash = credit_df.filter(credit_df['TRANS_TYPE'] == codes.pop())
        print(type(cash))
        for c in codes:
            cash = cash.unionAll(credit_df.filter(credit_df['TRANS_TYPE'] == c))


        cash_amt = cash.groupBy('ACCTNBR').sum('BILL_AMT')
        cash_count = cash.groupBy('ACCTNBR').count()

        cash_result = cash_amt.join(cash_count,'ACCTNBR','outer')
        cash_result = cash_result.select('ACCTNBR',cash_result['sum(BILL_AMT)'].alias('CASH_AMT'),cash_result['count'].alias('CASH_TIMES'))

        #cash_amt.show()
        #cash_count.show()




        #分期付款金额与次数
        installment_df = self.load_from_mysql('MPUR_D').cache()
        im_amt = installment_df.groupBy('XACCOUNT').sum('ORIG_PURCH')
        im_times = installment_df.groupBy('XACCOUNT').count()

        im_result = im_amt.join(im_times,'XACCOUNT','outer')
        im_result = im_result.select(im_result['XACCOUNT'].alias('ACCTNBR'),im_result['sum(ORIG_PURCH)'].alias('INSTALLMENT_AMT'),im_result['count'].alias('INSTALLMENT_TIMES'))




        #汇总

        # 加载额度数据
        credit_limit_df = self.load_from_mysql('t_CMMS_TEMP_KMEANS_CREDIT').select('ACCTNBR', 'CREDIT')

        all_join = swipe_result.join(cash_result,'ACCTNBR','outer').join(im_result,'ACCTNBR','outer').join(credit_limit_df,'ACCTNBR','outer')

        all_join.show()

        def m(line):
            acctnbr = line['ACCTNBR']
            credit = line['CREDIT'] if line['CREDIT'] is not None else 0
            # cycle_amt = line['CYCLE_AMT'] if line['CYCLE_AMT'] is not None else 0
            # try:
            #     cycle_times = cycle_amt/credit
            # except ZeroDivisionError:
            #     cycle_times = 0

            swipe_amt = line['SWIPE_AMT'] if line['SWIPE_AMT'] is not None else 0
            swipe_times = line['SWIPE_TIMES'] if line['SWIPE_TIMES'] is not None else 0
            cash_amt = line['CASH_AMT'] if line['CASH_AMT'] is not None else 0
            cash_times = line['CASH_TIMES'] if line['CASH_TIMES'] is not None else 0
            installment_amt = line['INSTALLMENT_AMT'] if line['INSTALLMENT_AMT'] is not None else 0
            installment_times = line['INSTALLMENT_TIMES'] if line['INSTALLMENT_TIMES'] is not None else 0



            return acctnbr,credit,swipe_amt,swipe_times,cash_amt,cash_times,installment_amt,installment_times


        sql = "replace into t_CMMS_TEMP_KMEANS_CREDIT(ACCTNBR,CREDIT,SWIPE_AMT,SWIPE_TIMES,CASH_AMT,CASH_TIMES,INSTALLMENT_AMT,INSTALLMENT_TIMES) values(%s,%s,%s,%s,%s,%s,%s,%s)"
        df = all_join.map(m)

        self.sql_operate(sql,df)
Ejemplo n.º 11
0
class KMAnalyse:
    def __init__(self):
        self.conf = (SparkConf()
                     .setAppName("KMeans")
                     .set("spark.cores.max", "2")
                     .set('spark.executor.extraClassPath', '/usr/local/env/lib/mysql-connector-java-5.1.38-bin.jar'))
        self.sc = SparkContext(conf=self.conf)
        self.sqlctx = SQLContext(self.sc)
        self.mysql_helper = MySQLHelper('core', host='10.9.29.212')
        self.base = 'hdfs://master:9000/gmc/'

    def load_from_mysql(self, table, database='core'):
        url = "jdbc:mysql://10.9.29.212:3306/%s?user=root&characterEncoding=UTF-8" % database
        df = self.sqlctx.read.format("jdbc").options(url=url, dbtable=table, driver="com.mysql.jdbc.Driver").load()
        return df

    @staticmethod
    def clustering_score(data, k):
        model = KMeans.train(data, k=k)

        def distance(v1, v2):
            s = 0
            # [1,2,3] [4,5,6] --> [(1,4),(2,5),(3,6)]
            pairs = zip(v1, v2)
            for p in pairs:
                sub = float(p[0]) - float(p[1])
                s = s + sub * sub
            return math.sqrt(s)

        def dist_to_centroid(datum):
            # predict the data
            cluster = model.predict(datum)
            # get the current centroid --> means center point
            centroid = model.clusterCenters[cluster]
            # call distance method
            return distance(centroid, datum)

        return data.map(dist_to_centroid).mean()

    def try_different_k(self, dataframe, max_k, min_k=2):
        data = self.prepare_data(dataframe)

        get_id_sql = "select ID from t_CMMS_TEMP_KMEANS_RESULT order by ID desc limit 1"
        try:
            id = int(self.mysql_helper.fetchone(get_id_sql)[0]) + 1
        except:
            id = 1
        columns = str(dataframe.columns)
        l = []
        for k in range(min_k, max_k):
            sorce = self.clustering_score(data, k)
            print(k, sorce)
            l.append(sorce)
            self.mysql_helper.execute('insert into t_CMMS_TEMP_KMEANS_RESULT(ID,K,SORCE,COLUMNS) values(%s,%s,%s,%s)', (id, k, sorce, columns))
        return id


    def prepare_data(self, dataframe):
        '''
        format data
        :param dataframe:
        :return:
        '''

        def v_map(line):
            lst = []
            for c in line:
                if c is not None:
                    lst.append(float(c))
                else:
                    lst.append(0.0)
            return lst

        return dataframe.rdd.cache().map(v_map)

    def train_model(self, dataframe, k, model_name):
        '''
        use data to train model
        :param dataframe: all columns for train
        :param k:k value
        :param model_name:the trained model
        :return:None
        '''

        data = self.prepare_data(dataframe)

        # train to get model
        model = KMeans.train(data, k)

        # create model saving path
        path = self.base + model_name

        # try to delete the old model if it exists
        try:
            import subprocess
            subprocess.call(["hadoop", "fs", "-rm", "-f", path])
        except:
            pass
        # save new model on hdfs
        model.save(self.sc, path)
        # print all cluster of the model
        for c in model.clusterCenters:
            l = []
            for i in c:
                i = decimal.Decimal(i).quantize(decimal.Decimal('0.01'))
                l.append(float(i))
            print(l)





    def predict(self, model_name, data):

        '''
        predict unknown data
        :param model_name: the trained model saving on hdfs
        :param data: unknown data
        :return: (cluster_index, cluster)
        '''

        # try to load the specified model
        path = self.base + model_name
        try:
            model = KMeansModel.load(self.sc, path)
        except:
            raise Exception('No such model found on hdfs!')

        # get the predict : means which cluster it belongs to
        index = model.predict(data)
        print('Data:%s belongs to cluster:%s. The index is %s' % (data, model.clusterCenters[index], index))
        return index, model.clusterCenters[index]

    def vaildate(self, validate_data, model_name):

        correct = 0
        error = 0

        for line in validate_data:
            known_cluster = line[0]
            stay_predict_data = line[1]
            predict_cluster = self.predict(model_name, stay_predict_data)[0]  # Only get the index
            if known_cluster == predict_cluster:
                correct += 1
            else:
                error += 1

        total = len(validate_data)
        result = {'total': total, 'correct': correct, 'error': error, 'accurancy': correct / total}

        print(result)
        return result

    def find_best_k(self, df, times, min_k=2, max_k=15):
        l = []

        for i in range(times):
            id = kma.try_different_k(df, max_k)
            l.append(id)

        sql = "SELECT k,avg(SORCE) FROM t_CMMS_TEMP_KMEANS_RESULT  where ID in %s group by K" % str(tuple(l))
        result = self.mysql_helper.fetchall(sql)
        for i in result:
            print(i[0])
        print('\n')
        for i in result:
            print(i[1])


    def print_model(self,model_name):
        # try to load the specified model
        path = self.base + model_name
        try:
            model = KMeansModel.load(self.sc, path)
        except:
            raise Exception('No such model found on hdfs!')

        for c in model.clusterCenters:
            print(c)
        for c in model.clusterCenters:
            l = []
            for i in c:
                i = decimal.Decimal(i).quantize(decimal.Decimal('0.01'))
                l.append(float(i))
            print(l)

    def test_rfm_data(self):
        life_cycle = self.load_from_mysql('t_CMMS_ANALYSE_LIFE').select('CUST_NO', 'LIFE_CYC')
        value = self.load_from_mysql('t_CMMS_ANALYSE_VALUE').select('CUST_NO', 'CUST_VALUE')
        loyalty = self.load_from_mysql('t_CMMS_ANALYSE_LOYALTY').select('CUST_NO', 'LOYALTY')
        rfm = loyalty.join(value, 'CUST_NO', 'left_outer').join(life_cycle, 'CUST_NO', 'left_outer').select('LIFE_CYC',
                                                                                                            'CUST_VALUE',
                                                                                                            'LOYALTY')

        return rfm

    def test_cust_info_data(self):
        return self.load_from_mysql('t_CMMS_TEMP_KMEANS_COLUMNS').select('LIFE_CYC', 'LOYALTY', 'CUST_RANK', 'AGE',
                                                                         'LOCAL', 'SEX', 'AUM')

    def test_credit_data(self):
        return self.load_from_mysql('t_CMMS_TEMP_KMEANS_CREDIT').select('CREDIT', 'CYCLE_TIMES', 'CYCLE_AMT',
                                                                        'INSTALLMENT_TIMES', 'INSTALLMENT_AMT',
                                                                        'SWIPE_TIMES', 'SWIPE_AMT', 'CASH_TIMES',
                                                                        'CASH_AMT')

    def full_keys_data(self):
        return self.load_from_mysql('t_CMMS_TEMP_KMEANS_CREDIT').select('CREDIT', 'CYCLE_TIMES', 'CYCLE_AMT', 'CYCLE_RATE',
                                                                        'INSTALLMENT_TIMES', 'INSTALLMENT_AMT',
                                                                        'SWIPE_TIMES', 'SWIPE_AMT', 'CASH_TIMES',
                                                                        'CASH_AMT')


    def no_cycle_data(self):
        return self.load_from_mysql('t_CMMS_TEMP_KMEANS_CREDIT').select('CREDIT',
                                                                        'INSTALLMENT_TIMES', 'INSTALLMENT_AMT',
                                                                        'SWIPE_TIMES', 'SWIPE_AMT', 'CASH_TIMES',
                                                                        'CASH_AMT')
    def no_amt_data(self):
        return self.load_from_mysql('t_CMMS_TEMP_KMEANS_CREDIT').select('CREDIT', 'CYCLE_TIMES',
                                                                        'CYCLE_RATE',
                                                                        'INSTALLMENT_TIMES',
                                                                        'SWIPE_TIMES',  'CASH_TIMES')

    def no_cycle_amt(self):
        return self.load_from_mysql('t_CMMS_TEMP_KMEANS_CREDIT').select('CREDIT', 'CYCLE_TIMES',
                                                                        'CYCLE_RATE',
                                                                        'INSTALLMENT_TIMES', 'INSTALLMENT_AMT',
                                                                        'SWIPE_TIMES', 'SWIPE_AMT', 'CASH_TIMES',
                                                                        'CASH_AMT')
Ejemplo n.º 12
0
class Credit:
    def __init__(self):
        self.conf = (SparkConf()
                     .setAppName("CREDIT")
                     .set("spark.cores.max", "2")
                     .set('spark.executor.extraClassPath', '/usr/local/env/lib/mysql-connector-java-5.1.38-bin.jar'))
        self.sc = SparkContext(conf=self.conf)
        self.sqlctx = SQLContext(self.sc)
        self.mysql_helper = MySQLHelper('core', host='10.9.29.212')
        self.base = 'hdfs://master:9000/gmc/'

    def load_from_mysql(self, table, database='core'):
        url = "jdbc:mysql://10.9.29.212:3306/%s?user=root&characterEncoding=UTF-8" % database
        df = self.sqlctx.read.format("jdbc").options(url=url, dbtable=table, driver="com.mysql.jdbc.Driver").load()
        return df

    def sql_operate(self, sql, rdd, once_size=1000):
        temp = []
        for row in rdd.collect():
            # print(row)
            if len(temp) >= once_size:
                self.mysql_helper.executemany(sql, temp)
                temp.clear()
            temp.append(row)

        if len(temp) != 0:
            self.mysql_helper.executemany(sql, temp)
            temp.clear()

    def prepare_fpgrowth_data(self):
        tran_df = self.load_from_mysql('t_CMMS_CREDIT_TRAN').filter("BILL_AMTFLAG = '+'").select('ACCTNBR',
                                                                                                 'MER_CAT_CD') \
            .filter("MER_CAT_CD != 0").filter("MER_CAT_CD != 6013")

        result = tran_df.map(lambda x: (str(int(x['ACCTNBR'])), [str(int(x['MER_CAT_CD'])), ])).groupByKey()

        def m(x):
            k = x[0]
            l = list(x[1])

            v = set()
            for i in l:
                v.add(i[0])

            return set(v)

        result = result.map(m)
        for i in result.take(10):
            print(i)

        model = FPGrowth.train(result, minSupport=0.05, numPartitions=10)
        result = model.freqItemsets().collect()
        for r in result:
            print(r)

    def cycle_credit(self):
        '''
        信用卡聚类数据预处理
        :return:
        '''

        print('---------------------------信用卡-Start--------------------------')
        # 交易流水
        credit_tran_df = self.load_from_mysql('t_CMMS_CREDIT_TRAN').select('ACCTNBR', 'MONTH_NBR', 'BILL_AMT',
                                                                           'BILL_AMTFLAG').filter(
            "BILL_AMTFLAG ='-'").cache()

        # 卡账户信息
        credit_acct_df = self.load_from_mysql('ACCT_D').select('ACCTNBR', 'MONTH_NBR', 'STM_MINDUE')

        # 还款计算
        return_amt = credit_tran_df.groupBy('ACCTNBR', 'MONTH_NBR').sum('BILL_AMT')
        return_amt = return_amt.select('ACCTNBR', 'MONTH_NBR', return_amt['sum(BILL_AMT)'].alias('RETURNED'))

        # 去除0最低还款额,即未消费的账单月
        join = credit_acct_df.join(return_amt, ['ACCTNBR', 'MONTH_NBR'], 'outer').filter('STM_MINDUE != 0')

        # 清除缓存
        self.sqlctx.clearCache()

        def which_cycle_type(line):
            mindue = line['STM_MINDUE']
            returned = line['RETURNED']

            '''
            0:normal,all returned
            1:cycle credit
            2:overdue,don't return money
            '''
            if mindue is not None and returned is None:
                flag = 2
            elif returned >= mindue * 10:
                flag = 0
            elif returned > mindue and returned < mindue * 10:
                flag = 1
            else:
                flag = 9

            return Row(ACCTNBR=int(line['ACCTNBR']), MONTH_NBR=line['MONTH_NBR'], DUE_FLAG=flag,
                       STM_MINDUE=line['STM_MINDUE'])

        # 返回为PipelinedRDD
        join = join.map(which_cycle_type)

        # 转为DataFrame
        join = self.sqlctx.createDataFrame(join)
        '''
        +-------+--------+-----+
        | ACCTNBR | DUE_FLAG | count |
        +-------+--------+-----+
        | 608126 |    2 |    1 |
        | 608126 |    0 |    6 |
        | 608868 |    0 |    4 |
        '''
        # 按还款类型分类
        each_type = join.groupBy(['ACCTNBR', 'DUE_FLAG'])

        # 计算每种还款情况数量
        each_type_count = each_type.count()

        # 计算每种还款情况的最低还款额之和
        each_type_mindue_sum = each_type.sum('STM_MINDUE')

        # 计算还款情况总数
        all_type_count = each_type_count.groupBy('ACCTNBR').sum('count')

        # join 上述三表
        rate = each_type_count.join(each_type_mindue_sum, ['ACCTNBR', 'DUE_FLAG'], 'outer').join(all_type_count,
                                                                                                 'ACCTNBR', 'outer')

        # print(rate.columns)
        # ['ACCTNBR', 'DUE_FLAG', 'count', 'sum(STM_MINDUE)', 'sum(count)']

        # 筛选出循环信用的数据
        # TODO 暂时只取了循环信用的

        rate = rate.filter(rate['DUE_FLAG'] == 1)

        # 计算进入循环信用的比例
        rate = rate.select('ACCTNBR',
                           (rate['sum(STM_MINDUE)'] * 10).alias('CYCLE_AMT'),
                           rate['count'].alias('CYCLE_TIMES'),
                           (rate['count'] / rate['sum(count)']).alias('CYCLE_RATE'))

        # rate.show()
        # print(rate.count())


        def m(line):
            return line['CYCLE_TIMES'], line['CYCLE_AMT'], line['CYCLE_RATE'], line['ACCTNBR']

        sql = "update t_CMMS_TEMP_KMEANS_CREDIT set CYCLE_TIMES=%s,CYCLE_AMT=%s,CYCLE_RATE=%s where ACCTNBR=%s"

        df = rate.map(m)

        print('将数据更新至数据库...')
        self.sql_operate(sql, df)

        # 将未进入循环的 设为0
        print('将未进入循环的 设为0...')
        self.mysql_helper.execute(
            "update t_CMMS_TEMP_KMEANS_CREDIT set CYCLE_TIMES=0,CYCLE_AMT=0,CYCLE_RATE=0 where CYCLE_TIMES is null ")

    def losing_warn(self,year,month):



        # #计算月份
        # if season == 1:
        #     months_now = [1,2,3]
        #     months_before = [10,11,12]
        #     year_before = year - 1
        # else:
        #     months_now = [season * 3 - 2, season * 3 - 1, season * 3]
        #     months_before = [season * 3 - 5, season * 3 - 4, season * 3-3]
        #     year_before = year
        #
        #
        #
        # # 抽取每个季度数据
        #
        # for m in months_now:

        # 最近一个月
        month = '%02d' % month


        for i in range(2,29):
            day = '%02d' % i
            date = str(year) + month + day
            print(date)

            sql = "select MONTH_NBR from t_CMMS_CREDIT_TRAN where INP_DATE = %s limit 1"

            try:
                month_nbr = self.mysql_helper.fetchone(sql,(date,))
                if month_nbr is None:
                    continue
                else:
                    month_nbr = int(month_nbr[0])
                    break
            except Exception:
                continue

        if month_nbr is None:
            raise Exception("There is no data in database for month:%s" % month)
        else:
            print('the latest month_nbr is %s' % month_nbr)


        months_now = [month_nbr-2,month_nbr-1,month_nbr]
        months_before = [month_nbr-5,month_nbr-4,month_nbr-3]

        print('months_now',months_now)
        print('months_before',months_before)


        # 交易流水
        credit_tran_df = self.load_from_mysql('t_CMMS_CREDIT_TRAN').select('ACCTNBR', 'INP_DATE', 'MONTH_NBR', 'BILL_AMT',
                                                                           'BILL_AMTFLAG').filter("BILL_AMTFLAG ='+'").cache()



        # 筛选出两个季度对应的流水

        months_now_filter = None
        months_before_filter = None

        for i in months_now:
            f = credit_tran_df.filter(credit_tran_df['MONTH_NBR'] == i)
            if months_now_filter is None:
                months_now_filter = f
            else:
                months_now_filter = months_now_filter.unionAll(f)

        for i in months_before:
            f = credit_tran_df.filter(credit_tran_df['MONTH_NBR'] == i)
            if months_before_filter is None:
                months_before_filter = f
            else:
                months_before_filter = months_before_filter.unionAll(f)


        months_now_filter.groupBy('MONTH_NBR').count().show()
        months_before_filter.groupBy('MONTH_NBR').count().show()


        months_now_count = months_now_filter.groupBy('ACCTNBR').count()
        months_before_count = months_before_filter.groupBy('ACCTNBR').count()

        months_now_count.show()
        months_before_count.show()


        join = months_now_count.select('ACCTNBR',months_now_count['count'].alias('NOW_COUNT')).join(
            months_before_count.select('ACCTNBR',months_before_count['count'].alias('BEFORE_COUNT')),'ACCTNBR','outer'
        )

        join.show()

        def m(line):
            ncount = line['NOW_COUNT']
            bcount = line['BEFORE_COUNT']
            '''
            计算增长率
            9999:两季度均无数据
            8888:仅第一季度有数据,流失客户
            7777:仅第二季度有数据,新增客户
            其他数字:第二个季度较第一季度增长率 (s2-s1)/s1*100
            '''
            if ncount is None:
                if bcount is None:# n none,b none
                    pass
                else:# n none, b not none
                    increment = -9999
            else:
                if bcount is None:# n not none, b none
                    increment = 8888
                else:# n not none,b not none
                    increment = round((ncount-bcount)/bcount*100)

            '''
            计算信用卡生命周期(以增长率计算)
            100+ 快速增长
            50-100 增长
            -50-50 稳定
            -50- 衰退
            9999 流失

            '''
            if increment > 100 and increment != -9999:
                life = 1  # fast growing
            elif increment <=100 and increment >50:
                life = 2  # growing
            elif increment <= 50 and increment > -50:
                life = 3  # stable
            elif increment <= -50:
                life = 4  # losing
            else:
                life = 9  # no more tran  completely lost

            return line['ACCTNBR'],month_nbr,increment,life




        sql = "replace into t_CMMS_ANALYSE_CREDIT(ACCTNBR, MONTH_NBR, INCREMENT,LIFE, UPDATE_TIME) values(%s,%s,%s,%s,now())"

        rdd = join.map(m)
        print(type(rdd))

        self.sql_operate(sql,rdd)