Example #1
    def __init__(self, ptid=None, begin=None, end=None):
        # Read the configuration files
        self.reader = gd.Reader('GeoIP2-City.mmdb')
        self.cf = ConfigParser.ConfigParser()
        self.cf.read('imsiConfig2.conf')

        self.redshift = RedShift()
        self.mysql = Mysql()

        # Initialize the begin/end dates and the product id
        self.today = end if end else datetime.date.today().strftime("%Y-%m-%d")
        self.yesterday = begin if begin else (
            datetime.date.today() -
            datetime.timedelta(days=1)).strftime("%Y-%m-%d")
        self.before_yesterday = (
            datetime.datetime.strptime(str(self.yesterday), "%Y-%m-%d") +
            datetime.timedelta(days=-1)).strftime("%Y-%m-%d")
        self.date_7day_ago = (
            datetime.datetime.strptime(str(self.yesterday), "%Y-%m-%d") +
            datetime.timedelta(days=-7)).strftime("%Y-%m-%d")
        self.ptid = ptid if ptid else '600025'
        print self.today
        print self.yesterday
        print self.ptid

        self.day_yesterday = self.yesterday.replace('-', '')  #20100101
        self.day_before_yesterday = self.getPlusDay(self.yesterday,
                                                    -1)  #20100101
        self.day_7day_ago = self.getPlusDay(self.yesterday, -7)  #20100101

        # Build the data-query SQL
        self.sql_channel_cnt = "select media_source,count(distinct token) from view_appsflyer_postback_ios_install_token where install_time>='" + self.yesterday + "' and install_time<'" + self.today + "' and product_id=" + self.ptid + " and token is not null group by media_source;"
        self.sql_channel_ip_cnt = "select media_source,sum(cnt) from (select media_source,ip,count(distinct token) as cnt from view_appsflyer_postback_ios_install_token where install_time>='" + self.yesterday + "' and install_time<'" + self.today + "' and product_id=" + self.ptid + " and token is not null  group by media_source,ip having count(distinct token)>=5) group by media_source;"
        self.sql_channel_ip_imsi = "select media_source,country_iso,city,carrier,ip,split_part(imsi,'_',2) as imsi from view_appsflyer_postback_ios_install_token where install_time>='" + self.yesterday + "' and install_time<'" + self.today + "' and product_id=" + self.ptid + " and token is not null ;"
        self.sql_channel_imsi_cnt = "select media_source,sum(case when lenimsi=15 then 1 else 0 end) as cnt from (select distinct media_source,token,len(split_part(imsi,'_',2)) as lenimsi from view_appsflyer_postback_ios_install_token where install_time>='" + self.yesterday + "' and install_time<'" + self.today + "' and product_id=" + self.ptid + " and token is not null ) t group by media_source;"
        self.sql_channel_imei_cnt = "select media_source,sum(case when lenimei=15 then 1 else 0 end) as cnt from (select distinct media_source,token,len(imei) as lenimei from view_appsflyer_postback_ios_install_token where install_time>='" + self.yesterday + "' and install_time<'" + self.today + "' and product_id=" + self.ptid + " and token is not null ) t group by media_source;"
        self.sql_channel_hour_cnt = "select media_source,date_part(hour, install_time),count(distinct token) from view_appsflyer_postback_ios_install_token where install_time>='" + self.yesterday + "' and install_time<'" + self.today + "' and product_id=" + self.ptid + " and token is not null  group by media_source,date_part(hour, install_time);"
        self.sql_channel_wifi_cnt = "select media_source,sum(case when wifi='t' then cnt else 0 end) as cnt from (select media_source,wifi,count(distinct token) as cnt from view_appsflyer_postback_ios_install_token where install_time>='" + self.yesterday + "' and install_time<'" + self.today + "' and product_id=" + self.ptid + " and token is not null group by media_source,wifi) t group by media_source;"
        self.sql_channel_ver_cnt = "select media_source,ver,count(distinct token) from view_appsflyer_postback_ios_install_token where install_time>='" + self.yesterday + "' and install_time<'" + self.today + "' and product_id=" + self.ptid + " and token is not null  group by media_source,ver;"
        self.sql_channel_device_cnt = "select media_source,device_brand,count(distinct token) from view_appsflyer_postback_ios_install_token where install_time>='" + self.yesterday + "' and install_time<'" + self.today + "' and product_id=" + self.ptid + " and token is not null  group by media_source,device_brand;"
        self.sql_channel_IPseg_cnt = "select media_source,sum(cnt) from (select media_source,ip,count(token) as cnt from (select distinct media_source,token,split_part(ip,'.',1)||'.'||split_part(ip,'.',2)||'.'||split_part(ip,'.',3) as ip from view_appsflyer_postback_ios_install_token where install_time>='" + self.yesterday + "' and install_time<'" + self.today + "' and product_id=" + self.ptid + " and token is not null ) t group by media_source,ip having count(token)>=5) t2 group by media_source;"
        self.sql_channel_date = "select distinct media_source,'" + self.yesterday + "','" + self.ptid + "' from view_appsflyer_postback_ios_install_token where install_time>='" + self.yesterday + "' and install_time<'" + self.today + "' and product_id=" + self.ptid + " and token is not null;"
        # Build the t-distribution prediction model
        y = np.array([0.50, 0.60, 0.70, 0.80, 0.90, 0.95, 0.98])
        x = np.array([0.674, 0.842, 1.036, 1.282, 1.645, 1.960, 2.326])
        x = x.reshape(x.shape[0], 1)
        X = self.buildComplexDataset(x)

        alpha = 0.001
        l1_ratio = 0.001
        enet = ElasticNet(alpha=alpha, l1_ratio=l1_ratio)
        self.model = enet.fit(X, y)
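
The seven (x, y) pairs above are standard-normal quantiles paired with their cumulative probabilities, so the fitted ElasticNet acts as a small inverse-CDF interpolator that confidenceIntervalEstimation later queries with a t-like statistic. A self-contained sketch of the same idea (illustrative only, not part of the original source):

import numpy as np
from sklearn.linear_model import ElasticNet

# Quantiles of the standard normal and their cumulative probabilities.
y = np.array([0.50, 0.60, 0.70, 0.80, 0.90, 0.95, 0.98])
x = np.array([0.674, 0.842, 1.036, 1.282, 1.645, 1.960, 2.326]).reshape(-1, 1)
X = np.hstack((x, 1.0 / x))  # the [x, 1/x] expansion from buildComplexDataset
model = ElasticNet(alpha=0.001, l1_ratio=0.001).fit(X, y)

x_new = np.array([[1.645]])  # a t-like test statistic
print model.predict(np.hstack((x_new, 1.0 / x_new)))[0]  # roughly 0.90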
Example #2
    def __init__(self, today=None):
        # Read the configuration files

        self.redshift = RedShift()
        self.mysql = Mysql()

        # Initialize the begin/end dates
        self.today = today if today else datetime.date.today().strftime("%Y-%m-%d")
        self.yesterday = (datetime.datetime.strptime(str(self.today),"%Y-%m-%d") + datetime.timedelta(days=-1)).strftime("%Y-%m-%d")
        self.day_before_yesterday = (datetime.datetime.strptime(str(self.today),"%Y-%m-%d") + datetime.timedelta(days=-2)).strftime("%Y-%m-%d")
        
        print self.today
        print self.yesterday

        self.yesterday_day = self.yesterday.replace('-','')
        
        self.begin_date = '2017-06-14'
        self.begin_day = self.begin_date.replace('-','')

        self.spec_cond = self.getSpecCondition()

        # Build the data-query SQL
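        # Note on the SQL below: `grp` packs the AB-test key. split_part(grp,'_',1)
        # is the product name; regexp_substr(grp,'_[^.]*') extracts a '_' segment
        # whose last character is used as `classify` and whose middle slice as
        # `classify_group`.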

        self.purchase_sql = "select date,revenue_day,product_id,classify_group,classify,country_iso,platform,media_source,c_id,c_name,count(distinct token) as pay_user,sum(revenue) as purchase from (select replace(created_time,'-','') as date,replace(log_date,'-','') as revenue_day,classify_group,classify,country_iso,product_id,platform,token,media_source,nvl(c_id,'') as c_id,nvl(nvl(f.campaign_name,campaign),'') as c_name,revenue from  (select created_time,c.log_date,classify_group,classify,c.country_iso,c.product_id,c.platform,token,media_source,c_id,campaign,d.revenue from (select created_time,log_date,classify_group,classify,country_iso,product_id,platform,a.token,media_source,nvl(fb_campaign_id,af_c_id) as c_id,campaign from (select trunc(server_time) as log_date,country_iso,product_id,token,case when sta_value like '%ca-app-pub%' then 'admob' else 'fb' end platform from sta_event_game_publish where server_time>='"+self.day_before_yesterday+"' and server_time<'" + self.yesterday + "' and sta_key in ('_RDA_REWARDOK','_NEW_RDA_REWARDOK')) b , (select distinct created_time,classify_group,classify,t1.gaid,t2.token,media_source,fb_campaign_id,af_c_id,campaign from (select trunc(created_time) as created_time,substring(regexp_substr(grp,'_[^.]*'),len(regexp_substr(grp,'_[^.]*')),1) as classify,substring(regexp_substr(grp,'_[^.]*'),2,len(regexp_substr(grp,'_[^.]*'))-3) as classify_group,split_part(device_info,'#',1) as gaid,split_part(grp,'_',1) as productname from adsdk_abtest_info where created_time>='"+self.begin_date+"' and created_time<'" + self.yesterday + "' and length(split_part(device_info,'#',1))>10 ) t1 ,(select trunc(install_time) as install_time,token,product_id,advertising_id,media_source,fb_campaign_id,af_c_id,campaign from view_appsflyer_postback_android_install_token where install_time>='"+self.begin_date+"' and install_time<'" + self.yesterday + "' and advertising_id is not null) t2,product p where t1.created_time=t2.install_time and t1.gaid=t2.advertising_id and t1.productname=p.pkg and p.pid = t2.product_id) a where b.token=a.token) c left outer join (select log_date,country_iso,product_id,platform,revenue/cnt_all as revenue from (select log_date,country_iso,product_id,platform,count(*) as cnt_all from (select trunc(server_time) as log_date,country_iso,product_id,token,case when sta_value like '%ca-app-pub%' then 'admob' else 'fb' end platform from sta_event_game_publish where server_time>='"+self.day_before_yesterday+"' and server_time<'" + self.yesterday + "' and sta_key in ('_RDA_REWARDOK','_NEW_RDA_REWARDOK') )t group by log_date,country_iso,product_id,platform) a, (select pid,country,revenue,day,channel,ad_type from game_roi_ad_revenue where day='"+self.day_before_yesterday+"' and revenue>0) b where a.log_date=b.day and a.product_id=b.pid and a.country_iso=b.country and a.platform=b.channel) d on c.log_date=d.log_date and c.product_id=d.product_id and c.country_iso=d.country_iso and c.platform=d.platform) e left outer join (select date,campaign_id,campaign_name from view_googleadwords_int where date>=" + self.begin_day + " group by date,campaign_id,campaign_name) f on e.c_id=f.campaign_id and replace(e.created_time,'-','')=f.date) h group by date,revenue_day,classify_group,classify,country_iso,product_id,platform,media_source,c_id,c_name;"

        self.purchase_sql_insert = "insert into ROI_Purchase_ABtest(date,revenue_day,pdtid,classify_group,classify,country_iso,platform,pid,campaign_id,c,pay_user,purchase) values(%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s) ON DUPLICATE KEY UPDATE pay_user=values(pay_user),purchase=values(purchase);"

        self.retain_sql = "select date,'"+self.yesterday_day+"',product_id,classify_group,classify,country_iso,media_source,c_id,campaign,retain_user from (select date,product_id,classify_group,classify,country_iso,media_source,c_id,campaign,count(distinct token) as retain_user from (select replace(created_time,'-','') as date,a.product_id,classify_group,classify,a.country_iso,media_source,nvl(c_id,'') as c_id,nvl(campaign,'') as campaign,b.token from (select created_time,classify_group,classify,product_id,country_iso,t1.gaid,t2.token,media_source,nvl(fb_campaign_id,af_c_id) as c_id,campaign from (select distinct trunc(created_time) as created_time,substring(regexp_substr(grp,'_[^.]*'),len(regexp_substr(grp,'_[^.]*')),1) as classify,substring(regexp_substr(grp,'_[^.]*'),2,len(regexp_substr(grp,'_[^.]*'))-3) as classify_group, split_part(device_info,'#',1) as gaid,split_part(grp,'_',1) as productname from adsdk_abtest_info where created_time>='"+self.begin_date+"' and created_time<'" + self.today + "' and length(split_part(device_info,'#',1))>10 ) t1, (select trunc(install_time) as install_time,token,product_id,country_iso,advertising_id,media_source,fb_campaign_id,af_c_id,campaign from view_appsflyer_postback_android_install_token where install_time>='"+self.begin_date+"' and install_time<'" + self.today + "' and advertising_id is not null)t2, product p where t1.created_time = t2.install_time and t1.gaid=t2.advertising_id and t1.productname=p.pkg and p.pid = t2.product_id) a left join (select distinct trunc(server_time) as server_time,country_iso,token,product_id from sta_event_game_publish where server_time>='" + self.yesterday + "' and server_time<'" + self.today + "' and (product_id=600020 and sta_key not in ('T0I','T0J','_CI') and sta_key not like '_NEW%')) b on a.token = b.token and a.product_id=b.product_id) c group by date,product_id,classify_group,classify,country_iso,media_source,c_id,campaign) d where retain_user>0;"


        self.retain_sql_insert = "insert into ROI_Retained_ABtest(date,retain_day,pdtid,classify_group,classify,country_iso,pid,campaign_id,c,spec_retain) values(%s,%s,%s,%s,%s,%s,%s,%s,%s,%s) ON DUPLICATE KEY UPDATE spec_retain=values(spec_retain);"


        self.install_sql = "select a.date,product_id,classify_group,classify,country_iso,media_source,nvl(c_id,''),nvl(nvl(f.campaign_name,campaign),'') as c_name,install_num from (select replace(created_time,'-','') as date,product_id,classify_group,classify,country_iso,media_source,nvl(fb_campaign_id,af_c_id) as c_id,campaign,count(distinct t2.token) as install_num from (select distinct trunc(created_time) as created_time,substring(regexp_substr(grp,'_[^.]*'),len(regexp_substr(grp,'_[^.]*')),1) as classify,substring(regexp_substr(grp,'_[^.]*'),2,len(regexp_substr(grp,'_[^.]*'))-3) as classify_group,split_part(device_info,'#',1) as gaid,split_part(grp,'_',1) as productname from adsdk_abtest_info where created_time>='" + self.yesterday + "' and created_time<'" + self.today + "' and length(split_part(device_info,'#',1))>10 ) t1,(select trunc(install_time) as install_time,token,product_id,country_iso,advertising_id,media_source,fb_campaign_id,af_c_id,campaign from view_appsflyer_postback_android_install_token where install_time>='" + self.yesterday + "' and install_time<'" + self.today + "' and advertising_id is not null)t2,product p where t1.gaid=t2.advertising_id and t1.productname=p.pkg and p.pid = t2.product_id group by created_time,product_id,classify_group,classify,country_iso,media_source,nvl(fb_campaign_id,af_c_id),campaign) a left outer join (select date,campaign_id,campaign_name from view_googleadwords_int where date="+self.yesterday_day+" group by date,campaign_id,campaign_name) f on a.c_id=f.campaign_id and a.date=f.date;"

        self.install_sql_insert = "insert into ROI_install_ABtest(date,pdtid,classify_group,classify,country_iso,pid,campaign_id,c,install_num) values(%s,%s,%s,%s,%s,%s,%s,%s,%s) ON DUPLICATE KEY UPDATE install_num=values(install_num)"
Example #3
class ActiveFraud(object):
    """docstring for RedShift"""
    def __init__(self, begin=None, end=None):
        # Read the configuration files
        self.reader = gd.Reader('GeoIP2-City.mmdb')
        self.cf = ConfigParser.ConfigParser()
        self.cf.read('imsiConfig2.conf')

        self.redshift = RedShift()
        self.mysql = Mysql()

        # Initialize the begin/end dates and the product id
        self.today = end if end else datetime.date.today().strftime("%Y-%m-%d")
        self.yesterday = begin if begin else (
            datetime.date.today() -
            datetime.timedelta(days=1)).strftime("%Y-%m-%d")
        self.before_yesterday = (
            datetime.datetime.strptime(str(self.yesterday), "%Y-%m-%d") +
            datetime.timedelta(days=-1)).strftime("%Y-%m-%d")
        self.date_7day_ago = (
            datetime.datetime.strptime(str(self.yesterday), "%Y-%m-%d") +
            datetime.timedelta(days=-7)).strftime("%Y-%m-%d")

        self.ptid = '600025'
        print self.today
        print self.yesterday

        self.day_yesterday = self.yesterday.replace('-', '')  #20100101
        self.day_before_yesterday = self.getPlusDay(self.yesterday,
                                                    -1)  #20100101
        self.day_7day_ago = self.getPlusDay(self.yesterday, -7)  #20100101

        # Build the data-query SQL
        self.sql_channel_cnt = "select product_id,media_source,count(distinct token) from view_appsflyer_postback_android_install_token where install_time>='" + self.yesterday + "' and install_time<'" + self.today + "' and product_id in (select product_id from view_appsflyer_postback_android_install_token where install_time>='" + self.yesterday + "' and install_time<'" + self.today + "' and product_id>600000  and token is not null group by product_id having(count(distinct media_source)>1)) and token is not null group by media_source,product_id;"

        self.sql_channel_ip_cnt = "select product_id,media_source,sum(cnt) from (select product_id,media_source,ip,count(distinct token) as cnt from view_appsflyer_postback_android_install_token where install_time>='" + self.yesterday + "' and install_time<'" + self.today + "' and product_id in (select product_id from view_appsflyer_postback_android_install_token where install_time>='" + self.yesterday + "' and install_time<'" + self.today + "' and product_id>600000  and token is not null group by product_id having(count(distinct media_source)>1)) and token is not null group by product_id,media_source,ip having count(distinct token)<5) group by product_id,media_source;"

        self.sql_channel_ip_imsi = "select media_source,country_iso,city,carrier,ip,split_part(imsi,'_',2) as imsi from view_appsflyer_postback_android_install_token where install_time>='" + self.yesterday + "' and install_time<'" + self.today + "' and product_id=" + self.ptid + " and token is not null ;"

        self.sql_channel_imsi_cnt = "select product_id,media_source,sum(case when lenimsi=15 then 1 else 0 end) as cnt from (select distinct product_id,media_source,token,len(split_part(imsi,'_',2)) as lenimsi from view_appsflyer_postback_android_install_token where install_time>='" + self.yesterday + "' and install_time<'" + self.today + "' and  product_id in (select product_id from view_appsflyer_postback_android_install_token where install_time>='" + self.yesterday + "' and install_time<'" + self.today + "' and product_id>600000  and token is not null group by product_id having(count(distinct media_source)>1)) and token is not null ) t group by product_id,media_source;"

        self.sql_channel_imei_cnt = "select product_id,media_source,sum(case when lenimei=15 then 1 else 0 end) as cnt from (select distinct product_id,media_source,token,len(imei) as lenimei from view_appsflyer_postback_android_install_token where install_time>='" + self.yesterday + "' and install_time<'" + self.today + "' and product_id in (select product_id from view_appsflyer_postback_android_install_token where install_time>='" + self.yesterday + "' and install_time<'" + self.today + "' and product_id>600000  and token is not null group by product_id having(count(distinct media_source)>1)) and token is not null ) t group by product_id,media_source;"

        self.sql_channel_hour_cnt = "select product_id,media_source,date_part(hour, install_time),count(distinct token) from view_appsflyer_postback_android_install_token where install_time>='" + self.yesterday + "' and install_time<'" + self.today + "' and token is not null and product_id in (select product_id from view_appsflyer_postback_android_install_token where install_time>='" + self.yesterday + "' and install_time<'" + self.today + "' and product_id>600000  and token is not null group by product_id having(count(distinct media_source)>1))group by product_id,media_source,date_part(hour, install_time);"

        self.sql_channel_wifi_cnt = "select product_id,media_source,sum(case when wifi='t' then cnt else 0 end) as cnt from (select product_id,media_source,wifi,count(distinct token) as cnt from view_appsflyer_postback_android_install_token where install_time>='" + self.yesterday + "' and install_time<'" + self.today + "' and product_id in (select product_id from view_appsflyer_postback_android_install_token where install_time>='" + self.yesterday + "' and install_time<'" + self.today + "' and product_id>600000  and token is not null group by product_id having(count(distinct media_source)>1)) and token is not null group by product_id,media_source,wifi) t group by product_id,media_source;"

        self.sql_channel_ver_cnt = "select media_source,ver,count(distinct token) from view_appsflyer_postback_android_install_token where install_time>='" + self.yesterday + "' and install_time<'" + self.today + "' and product_id=" + self.ptid + " and token is not null  group by media_source,ver;"

        self.sql_channel_device_cnt = "select product_id,media_source,device_brand,count(distinct token) from view_appsflyer_postback_android_install_token where install_time>='" + self.yesterday + "' and install_time<'" + self.today + "' and product_id in (select product_id from view_appsflyer_postback_android_install_token where install_time>='" + self.yesterday + "' and install_time<'" + self.today + "' and product_id>600000  and token is not null group by product_id having(count(distinct media_source)>1)) and token is not null group by product_id,media_source,device_brand;"

        self.sql_channel_IPseg_cnt = "select media_source,sum(cnt) from (select media_source,ip,count(token) as cnt from (select distinct media_source,token,split_part(ip,'.',1)||'.'||split_part(ip,'.',2)||'.'||split_part(ip,'.',3) as ip from view_appsflyer_postback_android_install_token where install_time>='" + self.yesterday + "' and install_time<'" + self.today + "' and product_id=" + self.ptid + " and token is not null ) t group by media_source,ip having count(token)>=5) t2 group by media_source;"

        self.sql_channel_date = "select distinct product_id,media_source,'" + self.yesterday + "' from view_appsflyer_postback_android_install_token where install_time>='" + self.yesterday + "' and install_time<'" + self.today + "' and product_id>600000 and token is not null and product_id in (select product_id from view_appsflyer_postback_android_install_token where install_time>='" + self.yesterday + "' and install_time<'" + self.today + "' and product_id>600000  and token is not null group by product_id having(count(distinct media_source)>1));"

        # Build the t-distribution prediction model
        y = np.array([0.50, 0.60, 0.70, 0.80, 0.90, 0.95, 0.98])
        x = np.array([0.674, 0.842, 1.036, 1.282, 1.645, 1.960, 2.326])
        x = x.reshape(x.shape[0], 1)
        X = self.buildComplexDataset(x)

        alpha = 0.001
        l1_ratio = 0.001
        enet = ElasticNet(alpha=alpha, l1_ratio=l1_ratio)
        self.model = enet.fit(X, y)

    def getPlusDay(self, day, plus):
        date = datetime.datetime.strptime(
            str(day), "%Y-%m-%d") + datetime.timedelta(days=plus)
        string = date.strftime("%Y%m%d")
        return string

    def getPlusDate(self, date, plus):
        return (datetime.datetime.strptime(str(date), "%Y-%m-%d") +
                datetime.timedelta(days=plus)).strftime("%Y-%m-%d")

    def insertLoyaltyUser(self):

        sql_loyalty = "select install_time,media_source,product_id,sum(uncheat) as loyuser,sum(uncheat)*1.0/count(*) as loyrate from (select install_time,product_id,token,media_source, case when cnt_all>=100 then 1 when cnt_bg*1.0/cnt_all>0.4 then 0 else 1 end as uncheat from (select install_time,product_id,token,media_source, sum(cnt_key) as cnt_all, sum(case t3.sta_key when '_' then cnt_key else 0 end) as cnt_bg from  (select trunc(t1.install_time) as install_time,t1.product_id,t1.token,t1.media_source,t2.sta_key,count(*) as cnt_key from (select media_source,product_id,token,install_time from view_appsflyer_postback_android_install_token where  install_time>='" + self.yesterday + "'  and install_time<'" + self.today + "' and product_id in (select product_id from view_appsflyer_postback_android_install_token where install_time>='" + self.yesterday + "'  and install_time<'" + self.today + "' and product_id>600000  and token is not null group by product_id having(count(distinct media_source)>1))) t1 left outer join (select product_id,token,substring(sta_key,1,1) as sta_key,server_time from sta_event_game_publish where product_id in (select product_id from view_appsflyer_postback_android_install_token where install_time>='" + self.yesterday + "' and install_time<'" + self.today + "'  and product_id>600000  and token is not null group by product_id having(count(distinct media_source)>1)) and server_time>='" + self.yesterday + "' and server_time<'" + self.today + "') t2 on t1.token=t2.token and t1.product_id=t2.product_id group by trunc(t1.install_time),t1.product_id,t1.token,t1.media_source,t2.sta_key) t3  group by install_time,product_id,token,media_source ) t4 ) t5 group by install_time,product_id,media_source;"
        #print sql_loyalty
        sql_insert = "insert into active_fraud_analysis(day,channel,product_id,loyalty_users,loyalty_users_rate) values(%s,%s,%s,%s,%s) ON DUPLICATE KEY UPDATE loyalty_users=values(loyalty_users),loyalty_users_rate=values(loyalty_users_rate)"

        raw = self.redshift.getAll(sql_loyalty)
        self.mysql.insertMany(sql_insert, raw)

    def insertRetain(self):
        """
        Insert the next-day retention data.
        """

        sql = "INSERT INTO active_fraud_analysis (day,channel,product_id,retain) select * from (select str_to_date(t1.date, '%Y%m%d') as day,t1.pid,t1.pdtid,(case when t2.cnt is null then 0 else t2.cnt end)*1.0/t1.cnt as rate from (select date,pid,pdtid,sum(match_num) cnt from ROI_install where date='" + self.day_before_yesterday + "' and pdtid in (select pdtid from ROI_install where date='" + self.day_before_yesterday + "' and pdtid>600000 group by pdtid having(count(distinct pid)>1)) group by pid,pdtid) t1 left outer join (select date,pid,pdtid,retain_day,sum(spec_retain) cnt from ROI_Retained where date='" + self.day_before_yesterday + "' and retain_day='" + self.day_yesterday + "' and pdtid in (select pdtid from ROI_Retained where date='" + self.day_before_yesterday + "' and retain_day='" + self.day_yesterday + "' and pdtid>600000 group by pdtid having(count(distinct pid)>1)) group by pid,pdtid) t2 on t1.date=t2.date and t1.pid=t2.pid and t1.pdtid=t2.pdtid) t where rate is not null on DUPLICATE KEY UPDATE retain=rate;"
        #print sql
        self.mysql.insertOne(sql, None)

    def insertAvgReturn7day(self):

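        # avg_return_7day = (retained visits across the 7-day window) /
        # (days elapsed * installs): the mean daily return rate per
        # install cohort, joined from ROI_install and ROI_Retained.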
        sql = "INSERT INTO active_fraud_analysis (day,channel,product_id,avg_return_7day) select * from (select str_to_date(t1.date, '%Y%m%d') as day,t1.pid,t1.pdtid,t2.cnt/(datediff(str_to_date('" + self.day_yesterday + "','%Y%m%d'),str_to_date(t1.date, '%Y%m%d'))*t1.cnt) rate from (select date,pid,pdtid,sum(match_num) cnt from ROI_install where pdtid in (select pdtid from ROI_install where date>='" + self.day_7day_ago + "' and date<'" + self.day_yesterday + "' and pdtid>600000 group by pdtid having(count(distinct pid)>1)) and date>='" + self.day_7day_ago + "' and date<'" + self.day_yesterday + "' group by pid,pdtid,date) t1 left outer join (select date,pid,pdtid,sum(cnt) as cnt from (select date,pid,pdtid,retain_day,sum(spec_retain) cnt from ROI_Retained where pdtid in (select pdtid from ROI_Retained where date>='" + self.day_7day_ago + "' and retain_day>date and retain_day<='" + self.day_yesterday + "' and pdtid>600000 group by pdtid having(count(distinct pid)>1)) and date>='" + self.day_7day_ago + "' and retain_day>date and retain_day<='" + self.day_yesterday + "' group by pdtid,pid,retain_day,date) t group by date,pid,pdtid) t2 on t1.date=t2.date and t1.pid=t2.pid and t1.pdtid=t2.pdtid) t3 where rate is not null on DUPLICATE KEY UPDATE avg_return_7day=rate;"
        #print sql

        self.mysql.insertOne(sql, None)

    def insertScore(self):

        sql_new = "select day,channel,product_id,ifnull(loyalty_users_rate,0),ifnull(retain,0),ifnull(avg_return_7day,0),ifnull(net_rate,0),ifnull(imsi_rate,0),ifnull(imei_rate,0),ifnull(hour_rate,0),ifnull(device_rate,0) from active_fraud_analysis where install>50 and day>='" + self.date_7day_ago + "' and day<'" + self.today + "' ;"
        sql_insert = "insert into active_fraud_analysis(day,channel,product_id,score) values(%s,%s,%s,%s) ON DUPLICATE KEY UPDATE score=values(score)"

        raw = self.mysql.getAll(sql_new)

        if not raw:
            sys.exit()

        for row in raw:

            # Skip rows whose retention is empty (zero)
            if row[4] == 0.0:
                continue
            value = [row[i] for i in range(3)]
            rate = [float(row[i]) for i in range(3, len(row))]

            score = reduce(lambda x, y: x * y,
                           np.array(rate)) * 10000 / (row[4] / row[5])**3
            if row[1] == 'Organic':
                score = 100.0

            value.extend([score])
            self.mysql.insertOne(sql_insert, value)

    def modifyScore(self):

        sql = "update active_fraud_analysis a, (select t2.day,t2.channel,t2.product_id,score*90/benchmark as score from (select day,product_id,max(score) as benchmark from active_fraud_analysis where install>50 and channel<>'Organic' and score>0  and day>='" + self.date_7day_ago + "' group by day,product_id) t1,(select day,channel,product_id,score from active_fraud_analysis where install>50 and channel<>'Organic' and score>0 and day>='" + self.date_7day_ago + "') t2 where t1.day=t2.day and t1.product_id=t2.product_id) b set a.score=b.score where a.day=b.day  and a.product_id=b.product_id and a.channel=b.channel;"
        #print sql
        self.mysql.update(sql)

    def sigmoid(self, X, useStatus):
        """
        Normalization: map ratio data into [0.5, 1].
        """
        if useStatus:
            return 1.0 / (1 + np.exp(-(X)))
        else:
            return X

    def makeDF(self, dt, ind=False, col=['key_p', 'key_m', 'cnt']):
        """
        Data arrives as a list whose elements are tuples, e.g. [('a', 1), ('b', 2)].
        """
        if ind:
            indexs = [i[0:2] for i in dt]
            data = [i[2:] for i in dt]
            df = pd.DataFrame(data, index=indexs, columns=col)
        else:
            df = pd.DataFrame(dt, columns=col)
        return df

    def analysisIP_IMSI(self):
        """
        Analyze IP/IMSI consistency and fetch the raw data.
        """
        raw = self.redshift.getAll(self.sql_channel_ip_imsi)
        result = map(self.judgeIP_IMSI, raw)
        a = [i for i in result if i[1] == 0]
        b = [i for i in result if i[1] == 1]
        dfa = self.makeDF(a, False, ['key', 'cnt_IPIMSI'])
        c = dfa.groupby('key').count()
        #dfb = self.makeDF(b, False, ['key','cntIPIMSI1'])
        #d = dfb.groupby('key').count()
        df = pd.concat([c], axis=1)

        return df

    def judgeIP_IMSI(self, l):
        """
        Check the IMSI's MCC and MNC against the whitelist and verify consistency with the IP's country.
        """
        media_source, country_iso, city, carrier, ip, imsi = l
        if not imsi or len(imsi) < 15:
            return (media_source, 0)
        try:
            response = self.reader.city(ip)
            # ISO code of the IP's country
            ipiso = response.country.iso_code  #TW
            mcc = imsi[0:3]
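            # IMSI layout: 3-digit MCC, then 2- or 3-digit MNC, then MSIN.
            # The config maps each MCC to 'ISO#ISO...,mnc,mnc,...': the first
            # comma field holds '#'-separated country codes, the rest MNCs.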
            if self.cf.has_option('DB', mcc):
                value = self.cf.get('DB', mcc)
                countryList = []
                mncList = value.split(',')[1:]
                if imsi[3:5] in mncList or imsi[3:6] in mncList:
                    countryList = value.split(',')[0].split('#')
                    if country_iso == ipiso and ipiso in countryList:
                        return (media_source, 1)
        except Exception as err:
            print err
            return (media_source, 0)
        return (media_source, 0)

    def buildComplexDataset(self, x):
        temp = 1.0 / x
        X = np.hstack((x, temp))
        return X

    def confidenceIntervalEstimation(self, l):
        """
        Estimate how far the channel's metric sits from Organic and compute its confidence.
        """
        media_source, n, u0, u, tag, rate = l
        if n < 50:
            #print [media_source, None]
            return [media_source, 0.01]
        if tag and u <= u0:
            return [media_source, 1.0]
        if u0 == u:
            return [media_source, 1.0]
        sigm = (u0 * (1 - u0))**rate
        se = sigm / (n**rate)

        x = abs(u - u0) / (se + random.random() / 10)

        X_predict = np.array([x])
        X_predict = X_predict.reshape(X_predict.shape[0], 1)
        X_predict = self.buildComplexDataset(X_predict)

        pred = self.model.predict(X_predict)[0]
        result = min(1, max(0.01, 1 - pred))

        #print [media_source, result]
        return [media_source, result]

    def analysisHour(self):
        """
        Count activations per channel and hour. Taking Organic as the baseline, find its three busiest hours, compute each other channel's share over those same three hours, and use a 95% confidence interval to judge channel authenticity.
        """
        raw = self.redshift.getAll(self.sql_channel_hour_cnt)
        # Sort and pick the three busiest hours
        if not raw:
            sys.exit(1)
        product_set = set([i[0] for i in raw])
        result = pd.DataFrame()
        for p in product_set:
            rawOrganic = [i for i in raw if i[1] == 'Organic' and i[0] == p]
            rawOrganic.sort(key=lambda x: -x[3])

            organicMost = [
                rawOrganic[i][2] for i in range(min(3, len(rawOrganic)))
            ]
            # Aggregate counts for the top-three hour buckets
            raw3most = [(i[0], i[1], i[3]) for i in raw
                        if i[2] in organicMost and i[0] == p]
            #rawAll = [(i[1],i[3]) for i in raw]

            df3most = self.makeDF(raw3most, False,
                                  ['key_p', 'key_m', 'cnt_hour'])
            df3mostSum = df3most.groupby(['key_p', 'key_m']).sum()
            #print df3mostSum
            #print "df3mostSum"
            #dfAll = self.makeDF(rawAll, False, ['key2', 'cnt_hour_all'])
            #dfAllSum = dfAll.groupby('key2').sum()

            result = pd.concat([result, df3mostSum], axis=0)

        return result

    def analysisDevice(self):
        """
        Analyze the device distribution.
        """
        raw = self.redshift.getAll(self.sql_channel_device_cnt)
        if not raw:
            sys.exit(1)
        product_set = set([i[0] for i in raw])
        result = pd.DataFrame()

        for p in product_set:
            rawOrganic = [i for i in raw if i[1] == 'Organic' and i[0] == p]
            rawOrganic.sort(key=lambda x: -x[3])

            organicMost = [
                rawOrganic[i][2] for i in range(min(3, len(rawOrganic)))
            ]
            # Aggregate counts for the top-three device brands
            raw3most = [(i[0], i[1], i[3]) for i in raw
                        if i[2] in organicMost and i[0] == p]
            #rawAll = [(i[1],i[3]) for i in raw]

            df3most = self.makeDF(raw3most, False,
                                  ['key_p', 'key_m', 'cnt_device'])
            df3mostSum = df3most.groupby(['key_p', 'key_m']).sum()
            #print df3mostSum
            #print "df3mostSum"
            #dfAll = self.makeDF(rawAll, False, ['key2', 'cnt_hour_all'])
            #dfAllSum = dfAll.groupby('key2').sum()

            result = pd.concat([result, df3mostSum], axis=0)

        return result

    def analysisModel(self, sql, index, col):
        #print sql
        raw = self.redshift.getAll(sql)
        df = self.makeDF(raw, index, col)
        return df

    def makeConfidenceIntervalEstimation(self, df, col, tag=False, rate=0.3):
        """
        tag selects one-sided versus central testing. If True (one-sided) and the tested ratio is below the Organic ratio, the confidence is 1.0. Defaults to False.
        """
        dfData = df.fillna(0).reset_index().values
        u0 = [i[2] * 1.0 / i[1] for i in dfData if i[0] == 'Organic'][0]
        dfList = [[i[0], i[1], u0, i[2] * 1.0 / i[1], tag, rate]
                  for i in dfData]
        result = map(self.confidenceIntervalEstimation, dfList)
        dfResult = self.makeDF(result, True, col)

        return dfResult

    def rateEstimation(self, l):
        product, media_source, n, u0, u, tag, rate = l
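        # Score the channel ratio u against the Organic baseline u0: at or
        # above the baseline it decays linearly (floored at 0.1) unless tag
        # marks the test one-sided; below the baseline it scores u / u0.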
        if u0 == 0:
            u0 = 0.01

        if u >= u0:
            if tag:
                return (product, media_source, 1.0)
            else:
                return (product, media_source, max((2 * u0 - u) / (u0), 0.1))
        else:
            return (product, media_source, u / (u0))

    def makeConfidenceRate(self, df, col, tag=False, rate=0.3):
        """
        tag selects one-sided versus central testing. If True (one-sided) and the tested ratio is below the Organic ratio, the confidence is 1.0. Defaults to False.
        """
        dfData = df.fillna(0).reset_index().values

        product_set = set([i[0] for i in dfData])

        pdf = pd.DataFrame()
        for p in product_set:
            u0 = [
                i[3] * 1.0 / i[2] for i in dfData
                if i[1] == 'Organic' and i[0] == p
            ][0]

            dfList = [[i[0], i[1], i[2], u0, i[3] * 1.0 / i[2], tag, rate]
                      for i in dfData if i[0] == p]

            result = map(self.rateEstimation, dfList)

            dfResult = self.makeDF(result, True, col)
            pdf = pd.concat([pdf, dfResult], axis=0)

        return pdf

    def saveToMysql(self, df):
        data = df.fillna(0).reset_index().values
        print data
        sql = "insert into active_fraud_analysis	(product_id,channel,day,install,cnt_ip,ip_rate,cnt_net,net_rate,cnt_imsi,imsi_rate,cnt_imei,imei_rate,cnt_hour,hour_rate,cnt_device,device_rate) values(%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s) ON DUPLICATE KEY UPDATE install=values(install),cnt_ip=values(cnt_ip),ip_rate=values(ip_rate),cnt_net=values(cnt_net),net_rate=values(net_rate),cnt_imsi=values(cnt_imsi),imsi_rate=values(imsi_rate),cnt_imei=values(cnt_imei),imei_rate=values(imei_rate),cnt_hour=values(cnt_hour),hour_rate=values(hour_rate),cnt_device=values(cnt_device),device_rate=values(device_rate)"

        #print data
        if len(data[0]) == 16:
            values = [(i[0], i[1], i[2], i[3], i[4], i[5], i[6], i[7], i[8],
                       i[9], i[10], i[11], i[12], i[13], i[14], i[15])
                      for i in data]
        else:
            values = [(i[0][0], i[0][1], i[1], i[2], i[3], i[4], i[5], i[6],
                       i[7], i[8], i[9], i[10], i[11], i[12], i[13], i[14])
                      for i in data]

        self.mysql.insertMany(sql, values)

    def analysis(self):

        print 'BEGIN'
        rawData = self.analysisModel(self.sql_channel_date, False,
                                     ['key_p', 'key_m', 'datetime'])  # date
        print 'rawData'
        rawData = rawData.groupby(['key_p', 'key_m']).max()

        rawChannel = self.analysisModel(self.sql_channel_cnt, False,
                                        ['key_p', 'key_m', 'channel'])  # install totals
        print 'rawChannel'
        rawChannel = rawChannel.groupby(['key_p', 'key_m']).max()

        rawIP = self.analysisModel(self.sql_channel_ip_cnt, False,
                                   ['key_p', 'key_m', 'cnt_IP'])  # duplicate-IP distribution
        rawIP = rawIP.groupby(['key_p', 'key_m']).max()
        print 'rawIP'

        rawNetwork = self.analysisModel(self.sql_channel_wifi_cnt, False,
                                        ['key_p', 'key_m', 'cnt_Net'])  # network (wifi) distribution
        rawNetwork = rawNetwork.groupby(['key_p', 'key_m']).max()
        print 'rawNetwork'
        #rawIPseg = self.analysisModel(self.sql_channel_IPseg_cnt, True, ['cnt_IPseg'])    # IP-segment distribution
        rawImsi = self.analysisModel(self.sql_channel_imsi_cnt, False,
                                     ['key_p', 'key_m', 'cnt_IMSI'])  # valid-IMSI (15-digit) counts
        print 'rawImsi'
        rawImsi = rawImsi.groupby(['key_p', 'key_m']).max()

        rawImei = self.analysisModel(self.sql_channel_imei_cnt, False,
                                     ['key_p', 'key_m', 'cnt_IMEI'])  # valid-IMEI (15-digit) counts
        rawImei = rawImei.groupby(['key_p', 'key_m']).max()

        rawHour = self.analysisHour()  # hour-of-day distribution

        rawDevice = self.analysisDevice()  # device distribution

        device = pd.concat([rawChannel, rawDevice], axis=1)

        #dfDevice = self.makeConfidenceIntervalEstimation(device, ['device_rate'],rate=0.5)
        dfDevice = self.makeConfidenceRate(device, ['device_rate'], rate=0.5)
        ip = pd.concat([rawChannel, rawIP], axis=1)

        dfIP = self.makeConfidenceRate(ip, ['ip_rate'], True)

        netWork = pd.concat([rawChannel, rawNetwork], axis=1)
        dfNetwork = self.makeConfidenceRate(netWork, ['network_rate'],
                                            rate=0.3)

        #IPseg = pd.concat([rawChannel, rawIPseg], axis=1)
        #dfIPseg = self.makeConfidenceIntervalEstimation(IPseg, ['ipseg_rate'])

        hour = pd.concat([rawChannel, rawHour], axis=1)

        #dfHour = self.makeConfidenceIntervalEstimation(hour, ['hour_rate'], rate=0.5)
        dfHour = self.makeConfidenceRate(hour, ['hour_rate'], rate=0.5)

        #IpImsi = pd.concat([rawChannel, rawIpImsi], axis=1)
        #dfIpImsi = self.makeConfidenceIntervalEstimation(IpImsi, ['IpImsi_rate'])

        imsi = pd.concat([rawChannel, rawImsi], axis=1)

        dfImsi = self.makeConfidenceRate(imsi, ['imsi_rate'], rate=0.3)

        imei = pd.concat([rawChannel, rawImei], axis=1)
        dfImei = self.makeConfidenceRate(imei, ['imei_rate'], rate=0.3)

        result = pd.concat([
            rawData, rawChannel, rawIP, dfIP, rawNetwork, dfNetwork, rawImsi,
            dfImsi, rawImei, dfImei, rawHour, dfHour, rawDevice, dfDevice
        ],
                           axis=1)
        print result
        print 'result'
        self.saveToMysql(result)

        self.insertRetain()
        self.insertAvgReturn7day()
        self.insertLoyaltyUser()

        self.insertScore()
        self.modifyScore()
Example #4
class ActiveFraud(object):
    """docstring for RedShift"""
    def __init__(self, ptid=None, begin=None, end=None):
        # Read the configuration files
        self.reader = gd.Reader('GeoIP2-City.mmdb')
        self.cf = ConfigParser.ConfigParser()
        self.cf.read('imsiConfig2.conf')

        self.redshift = RedShift()
        self.mysql = Mysql()

        # Initialize the begin/end dates and the product id
        self.today = end if end else datetime.date.today().strftime("%Y-%m-%d")
        self.yesterday = begin if begin else (
            datetime.date.today() -
            datetime.timedelta(days=1)).strftime("%Y-%m-%d")
        self.before_yesterday = (
            datetime.datetime.strptime(str(self.yesterday), "%Y-%m-%d") +
            datetime.timedelta(days=-1)).strftime("%Y-%m-%d")
        self.ptid = ptid if ptid else '600025'
        print self.today
        print self.yesterday
        print self.ptid

        self.day_yesterday = self.yesterday.replace('-', '')  #20100101
        self.day_before_yesterday = self.getPlusDay(self.yesterday,
                                                    -1)  #20100101
        self.day_7day_ago = self.getPlusDay(self.yesterday, -7)  #20100101

        # Build the data-query SQL
        self.sql_channel_cnt = "select media_source,count(distinct token) from view_appsflyer_postback_android_install_token where install_time>='" + self.yesterday + "' and install_time<'" + self.today + "' and product_id=" + self.ptid + " group by media_source;"
        self.sql_channel_ip_cnt = "select media_source,sum(cnt) from (select media_source,ip,count(distinct token) as cnt from view_appsflyer_postback_android_install_token where install_time>='" + self.yesterday + "' and install_time<'" + self.today + "' and product_id=" + self.ptid + " group by media_source,ip having count(distinct token)>=5) group by media_source;"
        self.sql_channel_ip_imsi = "select media_source,country_iso,city,carrier,ip,split_part(imsi,'_',2) as imsi from view_appsflyer_postback_android_install_token where install_time>='" + self.yesterday + "' and install_time<'" + self.today + "' and product_id=" + self.ptid + ";"
        self.sql_channel_imsi_cnt = "select media_source,sum(case when lenimsi=15 then 1 else 0 end) as cnt from (select distinct media_source,token,len(split_part(imsi,'_',2)) as lenimsi from view_appsflyer_postback_android_install_token where install_time>='" + self.yesterday + "' and install_time<'" + self.today + "' and product_id=" + self.ptid + ") t group by media_source;"
        self.sql_channel_imei_cnt = "select media_source,sum(case when lenimei=15 then 1 else 0 end) as cnt from (select distinct media_source,token,len(imei) as lenimei from view_appsflyer_postback_android_install_token where install_time>='" + self.yesterday + "' and install_time<'" + self.today + "' and product_id=" + self.ptid + ") t group by media_source;"
        self.sql_channel_hour_cnt = "select media_source,date_part(hour, install_time),count(distinct token) from view_appsflyer_postback_android_install_token where install_time>='" + self.yesterday + "' and install_time<'" + self.today + "' and product_id=" + self.ptid + " group by media_source,date_part(hour, install_time);"
        self.sql_channel_wifi_cnt = "select media_source,sum(case when wifi='t' then cnt else 0 end) as cnt from (select media_source,wifi,count(distinct token) as cnt from view_appsflyer_postback_android_install_token where install_time>='" + self.yesterday + "' and install_time<'" + self.today + "' and product_id=" + self.ptid + " group by media_source,wifi) t group by media_source;"
        self.sql_channel_ver_cnt = "select media_source,ver,count(distinct token) from view_appsflyer_postback_android_install_token where install_time>='" + self.yesterday + "' and install_time<'" + self.today + "' and product_id=" + self.ptid + " group by media_source,ver;"
        self.sql_channel_device_cnt = "select media_source,device_brand,count(distinct token) from view_appsflyer_postback_android_install_token where install_time>='" + self.yesterday + "' and install_time<'" + self.today + "' and product_id=" + self.ptid + " group by media_source,device_brand;"
        self.sql_channel_IPseg_cnt = "select media_source,sum(cnt) from (select media_source,ip,count(token) as cnt from (select distinct media_source,token,split_part(ip,'.',1)||'.'||split_part(ip,'.',2)||'.'||split_part(ip,'.',3) as ip from view_appsflyer_postback_android_install_token where install_time>='" + self.yesterday + "' and install_time<'" + self.today + "' and product_id=" + self.ptid + ") t group by media_source,ip having count(token)>=5) t2 group by media_source;"
        self.sql_channel_date = "select distinct media_source,'" + self.yesterday + "','" + self.ptid + "' from view_appsflyer_postback_android_install_token where install_time>='" + self.yesterday + "' and install_time<'" + self.today + "' and product_id=" + self.ptid + ";"
        # Build the t-distribution prediction model
        y = np.array([0.50, 0.60, 0.70, 0.80, 0.90, 0.95, 0.98])
        x = np.array([0.674, 0.842, 1.036, 1.282, 1.645, 1.960, 2.326])
        x = x.reshape(x.shape[0], 1)
        X = self.buildComplexDataset(x)

        alpha = 0.001
        l1_ratio = 0.001
        enet = ElasticNet(alpha=alpha, l1_ratio=l1_ratio)
        self.model = enet.fit(X, y)

    def getPlusDay(self, day, plus):
        date = datetime.datetime.strptime(
            str(day), "%Y-%m-%d") + datetime.timedelta(days=plus)
        string = date.strftime("%Y%m%d")
        return string

    def insertLoyaltyUser(self):
        if self.ptid == '600025':
            loyalty_key = '600025_T05'
        elif self.ptid == '600020':
            loyalty_key = 'T01'
        elif self.ptid == '600022':
            loyalty_key = 'P01'
        else:
            return 0

        sql_loyalty = "select day,media_source,product_id,sum(case when key_cnt>=3 then 1 else 0 end) as cnt,sum(case when key_cnt>=3 then 1 else 0 end)*1.0/count(*) as rate from (select distinct trunc(install_time) as day,media_source,product_id,token from view_appsflyer_postback_android_install_token where install_time>='" + self.yesterday + "' and install_time<'" + self.today + "' and product_id=" + self.ptid + ") t1 left outer join (select token,count(*) as cnt,count(case when sta_key='" + loyalty_key + "' then 1 else null end) as key_cnt from sta_event_game_publish where server_time>='" + self.yesterday + "' and server_time<'" + self.today + "' and product_id=" + self.ptid + " group by token) t2 on t1.token=t2.token group by day,media_source,product_id"

        sql_insert = "insert into active_fraud_analysis(day,channel,product_id,loyalty_users,loyalty_users_rate) values(%s,%s,%s,%s,%s) ON DUPLICATE KEY UPDATE loyalty_users=values(loyalty_users),loyalty_users_rate=values(loyalty_users_rate)"

        raw = self.redshift.getAll(sql_loyalty)
        print sql_loyalty
        self.mysql.insertMany(sql_insert, raw)

    def insertRetain(self):
        """
        Insert the next-day retention data.
        """
        sql = "INSERT INTO active_fraud_analysis (day,channel,product_id,retain) select * from (select str_to_date(t1.date, '%Y%m%d') as day,t1.pid,t1.pdtid,(case when t2.cnt is null then 0 else t2.cnt end)*1.0/t1.cnt as rate from (select date,pid,pdtid,sum(match_num) cnt from ROI_install where pdtid=" + self.ptid + " and date='" + self.day_before_yesterday + "' group by pid,pdtid) t1 left outer join (select date,pid,pdtid,retain_day,sum(spec_retain) cnt from ROI_Retained where pdtid=" + self.ptid + " and date='" + self.day_before_yesterday + "' and retain_day='" + self.day_yesterday + "'  group by pid) t2 on t1.date=t2.date and t1.pid=t2.pid) t on DUPLICATE KEY UPDATE retain=rate;"
        #print sql
        self.mysql.insertOne(sql, None)

    def insertAvgReturn7day(self):
        sql = "INSERT INTO active_fraud_analysis (day,channel,product_id,avg_return_7day) select * from (select str_to_date(t1.date, '%Y%m%d') as day,t1.pid,t1.pdtid,t2.cnt/(datediff(str_to_date('" + self.day_yesterday + "','%Y%m%d'),str_to_date(t1.date, '%Y%m%d'))*t1.cnt) rate from (select date,pid,pdtid,sum(match_num) cnt from ROI_install where pdtid=" + self.ptid + " and date>='" + self.day_7day_ago + "' and date<'" + self.day_yesterday + "' group by pid,pdtid,date) t1 left outer join (select date,pid,pdtid,sum(cnt) as cnt from (select date,pid,pdtid,retain_day,sum(spec_retain) cnt from ROI_Retained where pdtid=" + self.ptid + " and date>='" + self.day_7day_ago + "' and retain_day>date and retain_day<='" + self.day_yesterday + "' group by pid,retain_day,date) t group by date,pid,pdtid) t2 on t1.date=t2.date and t1.pid=t2.pid) t3 on DUPLICATE KEY UPDATE avg_return_7day=rate;"

        self.mysql.insertOne(sql, None)

    def insertScore(self):
        #sql = "insert into active_fraud_analysis(day,channel,product_id,score) select * from (select day,channel,product_id,log(loyalty_users_rate*100+1.0)*log(retain*100+1.0)*log(avg_return_7day*100+1.0)*log(ip_rate*net_rate*100+1.0)*log(imsi_rate*100+1.0)*log(imei_rate*100+1.0)*log(hour_rate*100+1.0)*log(device_rate*100+1.001) as score from  active_fraud_analysis where day>='2017-02-18' and day<'2017-02-23') t ON DUPLICATE KEY UPDATE score=t.score;"
        #print sql
        #self.mysql.insertOne(sql,None)

        sql_new = "select day,channel,product_id,ifnull(loyalty_users_rate,0),ifnull(retain,0),ifnull(avg_return_7day,0),ifnull(net_rate,0),ifnull(imsi_rate,0),ifnull(imei_rate,0),ifnull(hour_rate,0),ifnull(device_rate,0) from active_fraud_analysis where day>='" + self.before_yesterday + "' and day<'" + self.today + "'"
        sql_insert = "insert into active_fraud_analysis(day,channel,product_id,score) values(%s,%s,%s,%s) ON DUPLICATE KEY UPDATE score=values(score)"

        raw = self.mysql.getAll(sql_new)

        for row in raw:
            print row[3:6]
            value = [row[i] for i in range(3)]
            rate = [row[i] for i in range(6, len(row))]

            scoreList = self.sigmoid(np.array(rate), True)
            score = reduce(lambda x, y: x * y, scoreList) * 2**(
                len(row) - 3) / 15 * 100 * row[3] * row[3] * row[4] * row[5]

            value.extend([score])
            self.mysql.insertOne(sql_insert, value)

        return raw

    def sigmoid(self, X, useStatus):
        """
        Normalization: map ratio data into [0.5, 1].
        """
        if useStatus:
            return 1.0 / (1 + np.exp(-(X)))
        else:
            return X

    def makeDF(self, dt, ind=False, col=['key', 'cnt']):
        """
        Data arrives as a list whose elements are tuples, e.g. [('a', 1), ('b', 2)].
        """
        if ind:
            indexs = [i[0] for i in dt]
            data = [i[1:] for i in dt]
            df = pd.DataFrame(data, index=indexs, columns=col)
        else:
            df = pd.DataFrame(dt, columns=col)
        return df

    def analysisIP_IMSI(self):
        """
        Analyze IP/IMSI consistency and fetch the raw data.
        """
        raw = self.redshift.getAll(self.sql_channel_ip_imsi)
        result = map(self.judgeIP_IMSI, raw)
        a = [i for i in result if i[1] == 0]
        b = [i for i in result if i[1] == 1]
        dfa = self.makeDF(a, False, ['key', 'cnt_IPIMSI'])
        c = dfa.groupby('key').count()
        #dfb = self.makeDF(b, False, ['key','cntIPIMSI1'])
        #d = dfb.groupby('key').count()
        df = pd.concat([c], axis=1)

        return df

    def judgeIP_IMSI(self, l):
        """
        Check the IMSI's MCC and MNC against the whitelist and verify consistency with the IP's country.
        """
        media_source, country_iso, city, carrier, ip, imsi = l
        if not imsi or len(imsi) < 15:
            return (media_source, 0)
        try:
            response = self.reader.city(ip)
            # ISO code of the IP's country
            ipiso = response.country.iso_code  #TW
            mcc = imsi[0:3]
            if self.cf.has_option('DB', mcc):
                value = self.cf.get('DB', mcc)
                countryList = []
                mncList = value.split(',')[1:]
                if imsi[3:5] in mncList or imsi[3:6] in mncList:
                    countryList = value.split(',')[0].split('#')
                    if country_iso == ipiso and ipiso in countryList:
                        return (media_source, 1)
        except Exception as err:
            print err
            return (media_source, 0)
        return (media_source, 0)

    def buildComplexDataset(self, x):
        temp = 1.0 / x
        X = np.hstack((x, temp))
        return X

    def confidenceIntervalEstimation(self, l):
        """
        Estimate how far the channel's metric sits from Organic and compute its confidence.
        """
        media_source, n, u0, u, tag, rate = l
        if n < 50:
            #print [media_source, None]
            return [media_source, 0.01]
        if tag and u <= u0:
            return [media_source, 1.0]
        if u0 == u:
            return [media_source, 1.0]
        sigm = (u0 * (1 - u0))**rate
        se = sigm / (n**rate)

        x = abs(u - u0) / (se + random.random() / 10)
        #print x

        X_predict = np.array([x])
        X_predict = X_predict.reshape(X_predict.shape[0], 1)
        X_predict = self.buildComplexDataset(X_predict)
        #print X_predict
        pred = self.model.predict(X_predict)[0]
        result = min(1, max(0.01, 1 - pred))

        #print [media_source, result]
        return [media_source, result]

    def analysisHour(self):
        """
        Count activations per channel and hour. Taking Organic as the baseline, find its three busiest hours, compute each other channel's share over those same three hours, and use a 95% confidence interval to judge channel authenticity.
        """
        raw = self.redshift.getAll(self.sql_channel_hour_cnt)
        # Sort and pick the three busiest hours
        #print raw
        #print self.sql_channel_hour_cnt
        rawOrganic = [i for i in raw if i[0] == 'Organic']
        rawOrganic.sort(key=lambda x: -x[2])

        organicMost = [rawOrganic[i][1] for i in range(min(3, len(rawOrganic)))]
        # Aggregate counts for the top-three hour buckets
        raw3most = [(i[0], i[2]) for i in raw if i[1] in organicMost]
        rawAll = [(i[0], i[2]) for i in raw]

        df3most = self.makeDF(raw3most, False, ['key1', 'cnt_hour'])
        df3mostSum = df3most.groupby('key1').sum()
        #dfAll = self.makeDF(rawAll, False, ['key2', 'cnt_hour_all'])
        #dfAllSum = dfAll.groupby('key2').sum()

        # Build the combined dataframe
        result = pd.concat([df3mostSum], axis=1)
        return result

    def analysisDevice(self):
        """
        Analyze the device distribution.
        """
        raw = self.redshift.getAll(self.sql_channel_device_cnt)
        rawOrganic = [i for i in raw if i[0] == 'Organic']
        rawOrganic.sort(key=lambda x: -x[2])
        organicMost = [rawOrganic[i][1] for i in range(min(3, len(rawOrganic)))]
        # Aggregate counts for the top-three device brands
        raw3most = [(i[0], i[2]) for i in raw if i[1] in organicMost]
        rawAll = [(i[0], i[2]) for i in raw]

        df3most = self.makeDF(raw3most, False, ['key1', 'cnt_device'])
        df3mostSum = df3most.groupby('key1').sum()
        #dfAll = self.makeDF(rawAll, False, ['key2', 'cnt_device_all'])
        #dfAllSum = dfAll.groupby('key2').sum()

        # Build the combined dataframe
        result = pd.concat([df3mostSum], axis=1)
        return result

    def analysisModel(self, sql, index, col):
        #print sql
        raw = self.redshift.getAll(sql)
        df = self.makeDF(raw, index, col)
        return df

    def makeConfidenceIntervalEstimation(self, df, col, tag=False, rate=0.3):
        """
        tag selects one-sided versus central testing. If True (one-sided) and the tested ratio is below the Organic ratio, the confidence is 1.0. Defaults to False.
        """
        dfData = df.fillna(0).reset_index().values
        u0 = [i[2] * 1.0 / i[1] for i in dfData if i[0] == 'Organic'][0]
        dfList = [[i[0], i[1], u0, i[2] * 1.0 / i[1], tag, rate]
                  for i in dfData]
        result = map(self.confidenceIntervalEstimation, dfList)
        dfResult = self.makeDF(result, True, col)
        return dfResult

    def saveToMysql(self, df):
        data = df.fillna(0).reset_index().values
        sql = "insert into active_fraud_analysis(channel,day,product_id,install,cnt_ip,ip_rate,cnt_net,net_rate,cnt_imsi,imsi_rate,cnt_imei,imei_rate,cnt_hour,hour_rate,cnt_device,device_rate) values(%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s) ON DUPLICATE KEY UPDATE install=values(install),cnt_ip=values(cnt_ip),ip_rate=values(ip_rate),cnt_net=values(cnt_net),net_rate=values(net_rate),cnt_imsi=values(cnt_imsi),imsi_rate=values(imsi_rate),cnt_imei=values(cnt_imei),imei_rate=values(imei_rate),cnt_hour=values(cnt_hour),hour_rate=values(hour_rate),cnt_device=values(cnt_device),device_rate=values(device_rate)"

        #print data
        values = [(i[0], i[1], i[2], i[3], i[4], i[5], i[6], i[7], i[8], i[9],
                   i[10], i[11], i[12], i[13], i[14], i[15]) for i in data]
        print "values"
        print values
        self.mysql.insertMany(sql, values)
        #from sqlalchemy import create_engine

        ## Write the dataframe into MySQL; this first needs a connection from sqlalchemy.create_engine with charset=utf8, otherwise some latin characters cannot be handled.
        #yconnect = create_engine('mysql+mysqldb://datauser:ebOUeWSin3hAAKKD@test-dataverse-web.c0poh9vgjxya.rds.cn-north-1.amazonaws.com.cn:3306/dataverse?charset=utf8')

        #pd.io.sql.to_sql(df, 'zhx_test', yconnect, flavor='mysql', schema='dataverse', if_exists='append')
        #pd.io.sql.to_sql(rawChannel,'test_zhx', yconnect, flavor='mysql', schema='dataverse', if_exists='append')

    def analysis(self):

        print 'BEGIN'
        rawData = self.analysisModel(self.sql_channel_date, True,
                                     ['datetime', 'product_id'])  # date
        rawChannel = self.analysisModel(self.sql_channel_cnt, True,
                                        ['channel'])  # install totals
        rawIP = self.analysisModel(self.sql_channel_ip_cnt, True,
                                   ['cnt_IP'])  # duplicate-IP distribution
        rawNetwork = self.analysisModel(self.sql_channel_wifi_cnt, True,
                                        ['cnt_Net'])  # network (wifi) distribution
        #rawIPseg = self.analysisModel(self.sql_channel_IPseg_cnt, True, ['cnt_IPseg'])    # IP-segment distribution
        rawImsi = self.analysisModel(self.sql_channel_imsi_cnt, True,
                                     ['cnt_IMSI'])  # valid-IMSI (15-digit) counts
        rawImei = self.analysisModel(self.sql_channel_imei_cnt, True,
                                     ['cnt_IMEI'])  # valid-IMEI (15-digit) counts
        #rawIpImsi = self.analysisIP_IMSI()  # IP/IMSI anomaly distribution
        #print rawIpImsi
        rawHour = self.analysisHour()  # hour-of-day distribution
        rawDevice = self.analysisDevice()  # device distribution

        device = pd.concat([rawChannel, rawDevice], axis=1)
        dfDevice = self.makeConfidenceIntervalEstimation(device,
                                                         ['device_rate'],
                                                         rate=0.5)

        ip = pd.concat([rawChannel, rawIP], axis=1)
        dfIP = self.makeConfidenceIntervalEstimation(ip, ['ip_rate'], True)

        netWork = pd.concat([rawChannel, rawNetwork], axis=1)
        dfNetwork = self.makeConfidenceIntervalEstimation(netWork,
                                                          ['network_rate'],
                                                          rate=0.3)

        #IPseg = pd.concat([rawChannel, rawIPseg], axis=1)
        #dfIPseg = self.makeConfidenceIntervalEstimation(IPseg, ['ipseg_rate'])

        hour = pd.concat([rawChannel, rawHour], axis=1)
        dfHour = self.makeConfidenceIntervalEstimation(hour, ['hour_rate'],
                                                       rate=0.5)

        #IpImsi = pd.concat([rawChannel, rawIpImsi], axis=1)
        #dfIpImsi = self.makeConfidenceIntervalEstimation(IpImsi, ['IpImsi_rate'])

        imsi = pd.concat([rawChannel, rawImsi], axis=1)
        dfImsi = self.makeConfidenceIntervalEstimation(imsi, ['imsi_rate'],
                                                       rate=0.3)

        imei = pd.concat([rawChannel, rawImei], axis=1)
        dfImei = self.makeConfidenceIntervalEstimation(imei, ['imei_rate'],
                                                       rate=0.3)

        result = pd.concat([
            rawData, rawChannel, rawIP, dfIP, rawNetwork, dfNetwork, rawImsi,
            dfImsi, rawImei, dfImei, rawHour, dfHour, rawDevice, dfDevice
        ],
                           axis=1)
        self.saveToMysql(result)
        self.insertRetain()
        self.insertAvgReturn7day()
        self.insertLoyaltyUser()
        print result

        #new test
        result = self.insertScore()
        return result
Example #5
# In[1]:

import numpy as np
import pandas as pd

import os
import time
import datetime

from redshiftpool import RedShift
from mysqlpool import Mysql


# In[3]:

redshift = RedShift()
#mysql = Mysql()


# In[4]:

#sql_train_neg = "select c.token,to_char(client_time,'YYYY-MM-DD HH24:MI:SS'),c.sta_key,to_char(create_time,'YYYY-MM-DD HH24:MI:SS') from (select b.* from (select token from sta_new_user where server_time>='2017-06-01' and server_time<'2017-06-06' and product_id=600027) a ,(select * from (select token,client_time,sta_key,ROW_NUMBER () OVER (PARTITION BY token order by client_time asc) as row from sta_event_game_publish where server_time>='2017-06-01' and server_time<'2017-06-11' and product_id=600027 and substring(sta_key,1,1)='T') t where row<=30) b where a.token=b.token) c left join (select token,min(create_time) as create_time from game_iap where create_time>='2017-06-01' and product_id=600027 group by token)d on c.token=d.token where d.token is null;"

sql_train_neg = "select c.token,to_char(client_time,'YYYY-MM-DD HH24:MI:SS'),c.sta_key,to_char(create_time,'YYYY-MM-DD HH24:MI:SS') from (select b.* from (select token from sta_new_user where server_time>='2017-07-01' and server_time<'2017-07-06' and product_id=600027) a ,(select * from (select token,client_time,sta_key,ROW_NUMBER () OVER (PARTITION BY token order by client_time asc) as row from sta_event_game_publish where server_time>='2017-07-01' and server_time<'2017-07-11' and product_id=600027 and substring(sta_key,1,1)='T') t where row<=30) b where a.token=b.token) c left join (select token,min(create_time) as create_time from game_iap where create_time>='2017-07-01' and product_id=600027 group by token)d on c.token=d.token where d.token is null;"


sql_train_pos = "select c.token,to_char(client_time,'YYYY-MM-DD HH24:MI:SS'),c.sta_key,to_char(create_time,'YYYY-MM-DD HH24:MI:SS') from (select b.* from (select token from sta_new_user where server_time>='2017-03-01' and product_id=600027) a ,(select * from (select token,client_time,sta_key,ROW_NUMBER () OVER (PARTITION BY token order by client_time asc) as row from sta_event_game_publish where server_time>='2017-03-01' and product_id=600027 and substring(sta_key,1,1)='T') t where row<=30) b where a.token=b.token) c join (select token,min(create_time) as create_time from game_iap where create_time>='2017-04-01' and product_id=600027 group by token)d on c.token=d.token;"



# In[5]:
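
The notebook breaks off at In[5]. As a sketch only (assuming the RedShift pool's getAll used elsewhere on this page, with illustrative column names), the two cohorts above would typically be framed for feature building:

cols = ['token', 'client_time', 'sta_key', 'create_time']
df_neg = pd.DataFrame(redshift.getAll(sql_train_neg), columns=cols)  # never paid
df_pos = pd.DataFrame(redshift.getAll(sql_train_pos), columns=cols)  # paid later
print df_neg.shape, df_pos.shape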