def __init__(self, ptid=None, begin=None, end=None):
    # Load the configuration files
    self.reader = gd.Reader('GeoIP2-City.mmdb')
    self.cf = ConfigParser.ConfigParser()
    self.cf.read('imsiConfig2.conf')
    self.redshift = RedShift()
    self.mysql = Mysql()
    # Initialize the begin/end dates and the product id
    self.today = end if end else datetime.date.today().strftime("%Y-%m-%d")
    self.yesterday = begin if begin else (datetime.date.today() - datetime.timedelta(days=1)).strftime("%Y-%m-%d")
    self.before_yesterday = (datetime.datetime.strptime(str(self.yesterday), "%Y-%m-%d") + datetime.timedelta(days=-1)).strftime("%Y-%m-%d")
    self.date_7day_ago = (datetime.datetime.strptime(str(self.yesterday), "%Y-%m-%d") + datetime.timedelta(days=-7)).strftime("%Y-%m-%d")
    self.ptid = ptid if ptid else '600025'
    print self.today
    print self.yesterday
    print self.ptid
    self.day_yesterday = self.yesterday.replace('-', '')  # e.g. 20100101
    self.day_before_yesterday = self.getPlusDay(self.yesterday, -1)  # e.g. 20100101
    self.day_7day_ago = self.getPlusDay(self.yesterday, -7)  # e.g. 20100101
    # Build the data query SQL
    self.sql_channel_cnt = "select media_source,count(distinct token) from view_appsflyer_postback_ios_install_token where install_time>='" + self.yesterday + "' and install_time<'" + self.today + "' and product_id=" + self.ptid + " and token is not null group by media_source;"
    self.sql_channel_ip_cnt = "select media_source,sum(cnt) from (select media_source,ip,count(distinct token) as cnt from view_appsflyer_postback_ios_install_token where install_time>='" + self.yesterday + "' and install_time<'" + self.today + "' and product_id=" + self.ptid + " and token is not null group by media_source,ip having count(distinct token)>=5) group by media_source;"
    self.sql_channel_ip_imsi = "select media_source,country_iso,city,carrier,ip,split_part(imsi,'_',2) as imsi from view_appsflyer_postback_ios_install_token where install_time>='" + self.yesterday + "' and install_time<'" + self.today + "' and product_id=" + self.ptid + " and token is not null ;"
    self.sql_channel_imsi_cnt = "select media_source,sum(case when lenimsi=15 then 1 else 0 end) as cnt from (select distinct media_source,token,len(split_part(imsi,'_',2)) as lenimsi from view_appsflyer_postback_ios_install_token where install_time>='" + self.yesterday + "' and install_time<'" + self.today + "' and product_id=" + self.ptid + " and token is not null ) t group by media_source;"
    self.sql_channel_imei_cnt = "select media_source,sum(case when lenimei=15 then 1 else 0 end) as cnt from (select distinct media_source,token,len(imei) as lenimei from view_appsflyer_postback_ios_install_token where install_time>='" + self.yesterday + "' and install_time<'" + self.today + "' and product_id=" + self.ptid + " and token is not null ) t group by media_source;"
    self.sql_channel_hour_cnt = "select media_source,date_part(hour, install_time),count(distinct token) from view_appsflyer_postback_ios_install_token where install_time>='" + self.yesterday + "' and install_time<'" + self.today + "' and product_id=" + self.ptid + " and token is not null group by media_source,date_part(hour, install_time);"
    self.sql_channel_wifi_cnt = "select media_source,sum(case when wifi='t' then cnt else 0 end) as cnt from (select media_source,wifi,count(distinct token) as cnt from view_appsflyer_postback_ios_install_token where install_time>='" + self.yesterday + "' and install_time<'" + self.today + "' and product_id=" + self.ptid + " and token is not null group by media_source,wifi) t group by media_source;"
    self.sql_channel_ver_cnt = "select media_source,ver,count(distinct token) from view_appsflyer_postback_ios_install_token where install_time>='" + self.yesterday + "' and install_time<'" + self.today + "' and product_id=" + self.ptid + " and token is not null group by media_source,ver;"
    self.sql_channel_device_cnt = "select media_source,device_brand,count(distinct token) from view_appsflyer_postback_ios_install_token where install_time>='" + self.yesterday + "' and install_time<'" + self.today + "' and product_id=" + self.ptid + " and token is not null group by media_source,device_brand;"
    self.sql_channel_IPseg_cnt = "select media_source,sum(cnt) from (select media_source,ip,count(token) as cnt from (select distinct media_source,token,split_part(ip,'.',1)||'.'||split_part(ip,'.',2)||'.'||split_part(ip,'.',3) as ip from view_appsflyer_postback_ios_install_token where install_time>='" + self.yesterday + "' and install_time<'" + self.today + "' and product_id=" + self.ptid + " and token is not null ) t group by media_source,ip having count(token)>=5) t2 group by media_source;"
    self.sql_channel_date = "select distinct media_source,'" + self.yesterday + "','" + self.ptid + "' from view_appsflyer_postback_ios_install_token where install_time>='" + self.yesterday + "' and install_time<'" + self.today + "' and product_id=" + self.ptid + " and token is not null;"
    # Build the t-distribution prediction model
    y = np.array([0.50, 0.60, 0.70, 0.80, 0.90, 0.95, 0.98])
    x = np.array([0.674, 0.842, 1.036, 1.282, 1.645, 1.960, 2.326])
    x = x.reshape(x.shape[0], 1)
    X = self.buildComplexDataset(x)
    alpha = 0.001
    l1_ratio = 0.001
    enet = ElasticNet(alpha=alpha, l1_ratio=l1_ratio)
    self.model = enet.fit(X, y)
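# The model fit above maps a standardized distance x = |u - u0| / se to a
# two-sided confidence level by interpolating the normal-quantile table
# (0.674 -> 50%, 1.645 -> 90%, 2.326 -> 98%). A minimal standalone sketch of
# the same idea (illustration only, not part of the job itself):
#
#     import numpy as np
#     from sklearn.linear_model import ElasticNet
#
#     y = np.array([0.50, 0.60, 0.70, 0.80, 0.90, 0.95, 0.98])
#     x = np.array([0.674, 0.842, 1.036, 1.282, 1.645, 1.960, 2.326]).reshape(-1, 1)
#     X = np.hstack((x, 1.0 / x))  # same feature expansion as buildComplexDataset
#     model = ElasticNet(alpha=0.001, l1_ratio=0.001).fit(X, y)
#     x_new = np.array([[1.5]])
#     # interpolated two-sided confidence for x = 1.5; the exact normal value is about 0.866
#     print model.predict(np.hstack((x_new, 1.0 / x_new)))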
def __init__(self, today=None):
    # Load the configuration
    self.redshift = RedShift()
    self.mysql = Mysql()
    # Initialize the begin/end dates and the product id
    self.today = today if today else datetime.date.today().strftime("%Y-%m-%d")
    self.yesterday = (datetime.datetime.strptime(str(self.today), "%Y-%m-%d") + datetime.timedelta(days=-1)).strftime("%Y-%m-%d")
    self.day_before_yesterday = (datetime.datetime.strptime(str(self.today), "%Y-%m-%d") + datetime.timedelta(days=-2)).strftime("%Y-%m-%d")
    print self.today
    print self.yesterday
    self.yesterday_day = self.yesterday.replace('-', '')
    self.begin_date = '2017-06-14'
    self.begin_day = self.begin_date.replace('-', '')
    self.spec_cond = self.getSpecCondition()
    # Build the data query SQL
    self.purchase_sql = "select date,revenue_day,product_id,classify_group,classify,country_iso,platform,media_source,c_id,c_name,count(distinct token) as pay_user,sum(revenue) as purchase from (select replace(created_time,'-','') as date,replace(log_date,'-','') as revenue_day,classify_group,classify,country_iso,product_id,platform,token,media_source,nvl(c_id,'') as c_id,nvl(nvl(f.campaign_name,campaign),'') as c_name,revenue from (select created_time,c.log_date,classify_group,classify,c.country_iso,c.product_id,c.platform,token,media_source,c_id,campaign,d.revenue from (select created_time,log_date,classify_group,classify,country_iso,product_id,platform,a.token,media_source,nvl(fb_campaign_id,af_c_id) as c_id,campaign from (select trunc(server_time) as log_date,country_iso,product_id,token,case when sta_value like '%ca-app-pub%' then 'admob' else 'fb' end platform from sta_event_game_publish where server_time>='" + self.day_before_yesterday + "' and server_time<'" + self.yesterday + "' and sta_key in ('_RDA_REWARDOK','_NEW_RDA_REWARDOK')) b , (select distinct created_time,classify_group,classify,t1.gaid,t2.token,media_source,fb_campaign_id,af_c_id,campaign from (select trunc(created_time) as created_time,substring(regexp_substr(grp,'_[^.]*'),len(regexp_substr(grp,'_[^.]*')),1) as classify,substring(regexp_substr(grp,'_[^.]*'),2,len(regexp_substr(grp,'_[^.]*'))-3) as classify_group,split_part(device_info,'#',1) as gaid,split_part(grp,'_',1) as productname from adsdk_abtest_info where created_time>='" + self.begin_date + "' and created_time<'" + self.yesterday + "' and length(split_part(device_info,'#',1))>10 ) t1 ,(select trunc(install_time) as install_time,token,product_id,advertising_id,media_source,fb_campaign_id,af_c_id,campaign from view_appsflyer_postback_android_install_token where install_time>='" + self.begin_date + "' and install_time<'" + self.yesterday + "' and advertising_id is not null) t2,product p where t1.created_time=t2.install_time and t1.gaid=t2.advertising_id and t1.productname=p.pkg and p.pid = t2.product_id) a where b.token=a.token) c left outer join (select log_date,country_iso,product_id,platform,revenue/cnt_all as revenue from (select log_date,country_iso,product_id,platform,count(*) as cnt_all from (select trunc(server_time) as log_date,country_iso,product_id,token,case when sta_value like '%ca-app-pub%' then 'admob' else 'fb' end platform from sta_event_game_publish where server_time>='" + self.day_before_yesterday + "' and server_time<'" + self.yesterday + "' and sta_key in ('_RDA_REWARDOK','_NEW_RDA_REWARDOK') )t group by log_date,country_iso,product_id,platform) a, (select pid,country,revenue,day,channel,ad_type from game_roi_ad_revenue where day='" + self.day_before_yesterday + "' and revenue>0) b where a.log_date=b.day and a.product_id=b.pid and a.country_iso=b.country and a.platform=b.channel) d on c.log_date=d.log_date and c.product_id=d.product_id and c.country_iso=d.country_iso and c.platform=d.platform) e left outer join (select date,campaign_id,campaign_name from view_googleadwords_int where date>=" + self.begin_day + " group by date,campaign_id,campaign_name) f on e.c_id=f.campaign_id and replace(e.created_time,'-','')=f.date) h group by date,revenue_day,classify_group,classify,country_iso,product_id,platform,media_source,c_id,c_name;"
    self.purchase_sql_insert = "insert into ROI_Purchase_ABtest(date,revenue_day,pdtid,classify_group,classify,country_iso,platform,pid,campaign_id,c,pay_user,purchase) values(%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s) ON DUPLICATE KEY UPDATE pay_user=values(pay_user),purchase=values(purchase);"
    self.retain_sql = "select date,'" + self.yesterday_day + "',product_id,classify_group,classify,country_iso,media_source,c_id,campaign,retain_user from (select date,product_id,classify_group,classify,country_iso,media_source,c_id,campaign,count(distinct token) as retain_user from (select replace(created_time,'-','') as date,a.product_id,classify_group,classify,a.country_iso,media_source,nvl(c_id,'') as c_id,nvl(campaign,'') as campaign,b.token from (select created_time,classify_group,classify,product_id,country_iso,t1.gaid,t2.token,media_source,nvl(fb_campaign_id,af_c_id) as c_id,campaign from (select distinct trunc(created_time) as created_time,substring(regexp_substr(grp,'_[^.]*'),len(regexp_substr(grp,'_[^.]*')),1) as classify,substring(regexp_substr(grp,'_[^.]*'),2,len(regexp_substr(grp,'_[^.]*'))-3) as classify_group, split_part(device_info,'#',1) as gaid,split_part(grp,'_',1) as productname from adsdk_abtest_info where created_time>='" + self.begin_date + "' and created_time<'" + self.today + "' and length(split_part(device_info,'#',1))>10 ) t1, (select trunc(install_time) as install_time,token,product_id,country_iso,advertising_id,media_source,fb_campaign_id,af_c_id,campaign from view_appsflyer_postback_android_install_token where install_time>='" + self.begin_date + "' and install_time<'" + self.today + "' and advertising_id is not null)t2, product p where t1.created_time = t2.install_time and t1.gaid=t2.advertising_id and t1.productname=p.pkg and p.pid = t2.product_id) a left join (select distinct trunc(server_time) as server_time,country_iso,token,product_id from sta_event_game_publish where server_time>='" + self.yesterday + "' and server_time<'" + self.today + "' and (product_id=600020 and sta_key not in ('T0I','T0J','_CI') and sta_key not like '_NEW%')) b on a.token = b.token and a.product_id=b.product_id) c group by date,product_id,classify_group,classify,country_iso,media_source,c_id,campaign) d where retain_user>0;"
    self.retain_sql_insert = "insert into ROI_Retained_ABtest(date,retain_day,pdtid,classify_group,classify,country_iso,pid,campaign_id,c,spec_retain) values(%s,%s,%s,%s,%s,%s,%s,%s,%s,%s) ON DUPLICATE KEY UPDATE spec_retain=values(spec_retain);"
    self.install_sql = "select a.date,product_id,classify_group,classify,country_iso,media_source,nvl(c_id,''),nvl(nvl(f.campaign_name,campaign),'') as c_name,install_num from (select replace(created_time,'-','') as date,product_id,classify_group,classify,country_iso,media_source,nvl(fb_campaign_id,af_c_id) as c_id,campaign,count(distinct t2.token) as install_num from (select distinct trunc(created_time) as created_time,substring(regexp_substr(grp,'_[^.]*'),len(regexp_substr(grp,'_[^.]*')),1) as classify,substring(regexp_substr(grp,'_[^.]*'),2,len(regexp_substr(grp,'_[^.]*'))-3) as classify_group,split_part(device_info,'#',1) as gaid,split_part(grp,'_',1) as productname from adsdk_abtest_info where created_time>='" + self.yesterday + "' and created_time<'" + self.today + "' and length(split_part(device_info,'#',1))>10 ) t1,(select trunc(install_time) as install_time,token,product_id,country_iso,advertising_id,media_source,fb_campaign_id,af_c_id,campaign from view_appsflyer_postback_android_install_token where install_time>='" + self.yesterday + "' and install_time<'" + self.today + "' and advertising_id is not null)t2,product p where t1.gaid=t2.advertising_id and t1.productname=p.pkg and p.pid = t2.product_id group by created_time,product_id,classify_group,classify,country_iso,media_source,nvl(fb_campaign_id,af_c_id),campaign) a left outer join (select date,campaign_id,campaign_name from view_googleadwords_int where date=" + self.yesterday_day + " group by date,campaign_id,campaign_name) f on a.c_id=f.campaign_id and a.date=f.date;"
    self.install_sql_insert = "insert into ROI_install_ABtest(date,pdtid,classify_group,classify,country_iso,pid,campaign_id,c,install_num) values(%s,%s,%s,%s,%s,%s,%s,%s,%s) ON DUPLICATE KEY UPDATE install_num=values(install_num)"
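# Assuming this __init__ belongs to the daily ABtest ROI job class (the class
# statement is missing from this fragment, so the name below is hypothetical),
# a typical run would execute each Redshift query and upsert the rows into
# MySQL through the pool helpers used elsewhere in this module:
#
#     job = RoiABTestJob('2017-07-01')  # hypothetical class name
#     job.mysql.insertMany(job.purchase_sql_insert, job.redshift.getAll(job.purchase_sql))
#     job.mysql.insertMany(job.retain_sql_insert, job.redshift.getAll(job.retain_sql))
#     job.mysql.insertMany(job.install_sql_insert, job.redshift.getAll(job.install_sql))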
class ActiveFraud(object):
    """Active-install fraud analysis for all Android products with more than one channel."""

    def __init__(self, begin=None, end=None):
        # Load the configuration files
        self.reader = gd.Reader('GeoIP2-City.mmdb')
        self.cf = ConfigParser.ConfigParser()
        self.cf.read('imsiConfig2.conf')
        self.redshift = RedShift()
        self.mysql = Mysql()
        # Initialize the begin/end dates and the product id
        self.today = end if end else datetime.date.today().strftime("%Y-%m-%d")
        self.yesterday = begin if begin else (datetime.date.today() - datetime.timedelta(days=1)).strftime("%Y-%m-%d")
        self.before_yesterday = (datetime.datetime.strptime(str(self.yesterday), "%Y-%m-%d") + datetime.timedelta(days=-1)).strftime("%Y-%m-%d")
        self.date_7day_ago = (datetime.datetime.strptime(str(self.yesterday), "%Y-%m-%d") + datetime.timedelta(days=-7)).strftime("%Y-%m-%d")
        self.ptid = '600025'
        print self.today
        print self.yesterday
        self.day_yesterday = self.yesterday.replace('-', '')  # e.g. 20100101
        self.day_before_yesterday = self.getPlusDay(self.yesterday, -1)  # e.g. 20100101
        self.day_7day_ago = self.getPlusDay(self.yesterday, -7)  # e.g. 20100101
        # Build the data query SQL
        self.sql_channel_cnt = "select product_id,media_source,count(distinct token) from view_appsflyer_postback_android_install_token where install_time>='" + self.yesterday + "' and install_time<'" + self.today + "' and product_id in (select product_id from view_appsflyer_postback_android_install_token where install_time>='" + self.yesterday + "' and install_time<'" + self.today + "' and product_id>600000 and token is not null group by product_id having(count(distinct media_source)>1)) and token is not null group by media_source,product_id;"
        self.sql_channel_ip_cnt = "select product_id,media_source,sum(cnt) from (select product_id,media_source,ip,count(distinct token) as cnt from view_appsflyer_postback_android_install_token where install_time>='" + self.yesterday + "' and install_time<'" + self.today + "' and product_id in (select product_id from view_appsflyer_postback_android_install_token where install_time>='" + self.yesterday + "' and install_time<'" + self.today + "' and product_id>600000 and token is not null group by product_id having(count(distinct media_source)>1)) and token is not null group by product_id,media_source,ip having count(distinct token)<5) group by product_id,media_source;"
        self.sql_channel_ip_imsi = "select media_source,country_iso,city,carrier,ip,split_part(imsi,'_',2) as imsi from view_appsflyer_postback_android_install_token where install_time>='" + self.yesterday + "' and install_time<'" + self.today + "' and product_id=" + self.ptid + " and token is not null ;"
        self.sql_channel_imsi_cnt = "select product_id,media_source,sum(case when lenimsi=15 then 1 else 0 end) as cnt from (select distinct product_id,media_source,token,len(split_part(imsi,'_',2)) as lenimsi from view_appsflyer_postback_android_install_token where install_time>='" + self.yesterday + "' and install_time<'" + self.today + "' and product_id in (select product_id from view_appsflyer_postback_android_install_token where install_time>='" + self.yesterday + "' and install_time<'" + self.today + "' and product_id>600000 and token is not null group by product_id having(count(distinct media_source)>1)) and token is not null ) t group by product_id,media_source;"
        self.sql_channel_imei_cnt = "select product_id,media_source,sum(case when lenimei=15 then 1 else 0 end) as cnt from (select distinct product_id,media_source,token,len(imei) as lenimei from view_appsflyer_postback_android_install_token where install_time>='" + self.yesterday + "' and install_time<'" + self.today + "' and product_id in (select product_id from view_appsflyer_postback_android_install_token where install_time>='" + self.yesterday + "' and install_time<'" + self.today + "' and product_id>600000 and token is not null group by product_id having(count(distinct media_source)>1)) and token is not null ) t group by product_id,media_source;"
        self.sql_channel_hour_cnt = "select product_id,media_source,date_part(hour, install_time),count(distinct token) from view_appsflyer_postback_android_install_token where install_time>='" + self.yesterday + "' and install_time<'" + self.today + "' and token is not null and product_id in (select product_id from view_appsflyer_postback_android_install_token where install_time>='" + self.yesterday + "' and install_time<'" + self.today + "' and product_id>600000 and token is not null group by product_id having(count(distinct media_source)>1))group by product_id,media_source,date_part(hour, install_time);"
        self.sql_channel_wifi_cnt = "select product_id,media_source,sum(case when wifi='t' then cnt else 0 end) as cnt from (select product_id,media_source,wifi,count(distinct token) as cnt from view_appsflyer_postback_android_install_token where install_time>='" + self.yesterday + "' and install_time<'" + self.today + "' and product_id in (select product_id from view_appsflyer_postback_android_install_token where install_time>='" + self.yesterday + "' and install_time<'" + self.today + "' and product_id>600000 and token is not null group by product_id having(count(distinct media_source)>1)) and token is not null group by product_id,media_source,wifi) t group by product_id,media_source;"
        self.sql_channel_ver_cnt = "select media_source,ver,count(distinct token) from view_appsflyer_postback_android_install_token where install_time>='" + self.yesterday + "' and install_time<'" + self.today + "' and product_id=" + self.ptid + " and token is not null group by media_source,ver;"
        self.sql_channel_device_cnt = "select product_id,media_source,device_brand,count(distinct token) from view_appsflyer_postback_android_install_token where install_time>='" + self.yesterday + "' and install_time<'" + self.today + "' and product_id in (select product_id from view_appsflyer_postback_android_install_token where install_time>='" + self.yesterday + "' and install_time<'" + self.today + "' and product_id>600000 and token is not null group by product_id having(count(distinct media_source)>1)) and token is not null group by product_id,media_source,device_brand;"
        self.sql_channel_IPseg_cnt = "select media_source,sum(cnt) from (select media_source,ip,count(token) as cnt from (select distinct media_source,token,split_part(ip,'.',1)||'.'||split_part(ip,'.',2)||'.'||split_part(ip,'.',3) as ip from view_appsflyer_postback_android_install_token where install_time>='" + self.yesterday + "' and install_time<'" + self.today + "' and product_id=" + self.ptid + " and token is not null ) t group by media_source,ip having count(token)>=5) t2 group by media_source;"
        self.sql_channel_date = "select distinct product_id,media_source,'" + self.yesterday + "' from view_appsflyer_postback_android_install_token where install_time>='" + self.yesterday + "' and install_time<'" + self.today + "' and product_id>600000 and token is not null and product_id in (select product_id from view_appsflyer_postback_android_install_token where install_time>='" + self.yesterday + "' and install_time<'" + self.today + "' and product_id>600000 and token is not null group by product_id having(count(distinct media_source)>1));"
        # Build the t-distribution prediction model
        y = np.array([0.50, 0.60, 0.70, 0.80, 0.90, 0.95, 0.98])
        x = np.array([0.674, 0.842, 1.036, 1.282, 1.645, 1.960, 2.326])
        x = x.reshape(x.shape[0], 1)
        X = self.buildComplexDataset(x)
        alpha = 0.001
        l1_ratio = 0.001
        enet = ElasticNet(alpha=alpha, l1_ratio=l1_ratio)
        self.model = enet.fit(X, y)

    def getPlusDay(self, day, plus):
        date = datetime.datetime.strptime(str(day), "%Y-%m-%d") + datetime.timedelta(days=plus)
        string = date.strftime("%Y%m%d")
        return string

    def getPlusDate(self, date, plus):
        return (datetime.datetime.strptime(str(date), "%Y-%m-%d") + datetime.timedelta(days=plus)).strftime("%Y-%m-%d")

    def insertLoyaltyUser(self):
        sql_loyalty = "select install_time,media_source,product_id,sum(uncheat) as loyuser,sum(uncheat)*1.0/count(*) as loyrate from (select install_time,product_id,token,media_source, case when cnt_all>=100 then 1 when cnt_bg*1.0/cnt_all>0.4 then 0 else 1 end as uncheat from (select install_time,product_id,token,media_source, sum(cnt_key) as cnt_all, sum(case t3.sta_key when '_' then cnt_key else 0 end) as cnt_bg from (select trunc(t1.install_time) as install_time,t1.product_id,t1.token,t1.media_source,t2.sta_key,count(*) as cnt_key from (select media_source,product_id,token,install_time from view_appsflyer_postback_android_install_token where install_time>='" + self.yesterday + "' and install_time<'" + self.today + "' and product_id in (select product_id from view_appsflyer_postback_android_install_token where install_time>='" + self.yesterday + "' and install_time<'" + self.today + "' and product_id>600000 and token is not null group by product_id having(count(distinct media_source)>1))) t1 left outer join (select product_id,token,substring(sta_key,1,1) as sta_key,server_time from sta_event_game_publish where product_id in (select product_id from view_appsflyer_postback_android_install_token where install_time>='" + self.yesterday + "' and install_time<'" + self.today + "' and product_id>600000 and token is not null group by product_id having(count(distinct media_source)>1)) and server_time>='" + self.yesterday + "' and server_time<'" + self.today + "') t2 on t1.token=t2.token and t1.product_id=t2.product_id group by trunc(t1.install_time),t1.product_id,t1.token,t1.media_source,t2.sta_key) t3 group by install_time,product_id,token,media_source ) t4 ) t5 group by install_time,product_id,media_source;"
        #print sql_loyalty
        sql_insert = "insert into active_fraud_analysis(day,channel,product_id,loyalty_users,loyalty_users_rate) values(%s,%s,%s,%s,%s) ON DUPLICATE KEY UPDATE loyalty_users=values(loyalty_users),loyalty_users_rate=values(loyalty_users_rate)"
        raw = self.redshift.getAll(sql_loyalty)
        self.mysql.insertMany(sql_insert, raw)

    def insertRetain(self):
        """Insert the day-1 retention data."""
        sql = "INSERT INTO active_fraud_analysis (day,channel,product_id,retain) select * from (select str_to_date(t1.date, '%Y%m%d') as day,t1.pid,t1.pdtid,(case when t2.cnt is null then 0 else t2.cnt end)*1.0/t1.cnt as rate from (select date,pid,pdtid,sum(match_num) cnt from ROI_install where date='" + self.day_before_yesterday + "' and pdtid in (select pdtid from ROI_install where date='" + self.day_before_yesterday + "' and pdtid>600000 group by pdtid having(count(distinct pid)>1)) group by pid,pdtid) t1 left outer join (select date,pid,pdtid,retain_day,sum(spec_retain) cnt from ROI_Retained where date='" + self.day_before_yesterday + "' and retain_day='" + self.day_yesterday + "' and pdtid in (select pdtid from ROI_Retained where date='" + self.day_before_yesterday + "' and retain_day='" + self.day_yesterday + "' and pdtid>600000 group by pdtid having(count(distinct pid)>1)) group by pid,pdtid) t2 on t1.date=t2.date and t1.pid=t2.pid and t1.pdtid=t2.pdtid) t where rate is not null on DUPLICATE KEY UPDATE retain=rate;"
        #print sql
        self.mysql.insertOne(sql, None)

    def insertAvgReturn7day(self):
        sql = "INSERT INTO active_fraud_analysis (day,channel,product_id,avg_return_7day) select * from (select str_to_date(t1.date, '%Y%m%d') as day,t1.pid,t1.pdtid,t2.cnt/(datediff(str_to_date('" + self.day_yesterday + "','%Y%m%d'),str_to_date(t1.date, '%Y%m%d'))*t1.cnt) rate from (select date,pid,pdtid,sum(match_num) cnt from ROI_install where pdtid in (select pdtid from ROI_install where date>='" + self.day_7day_ago + "' and date<'" + self.day_yesterday + "' and pdtid>600000 group by pdtid having(count(distinct pid)>1)) and date>='" + self.day_7day_ago + "' and date<'" + self.day_yesterday + "' group by pid,pdtid,date) t1 left outer join (select date,pid,pdtid,sum(cnt) as cnt from (select date,pid,pdtid,retain_day,sum(spec_retain) cnt from ROI_Retained where pdtid in (select pdtid from ROI_Retained where date>='" + self.day_7day_ago + "' and retain_day>date and retain_day<='" + self.day_yesterday + "' and pdtid>600000 group by pdtid having(count(distinct pid)>1)) and date>='" + self.day_7day_ago + "' and retain_day>date and retain_day<='" + self.day_yesterday + "' group by pdtid,pid,retain_day,date) t group by date,pid,pdtid) t2 on t1.date=t2.date and t1.pid=t2.pid and t1.pdtid=t2.pdtid) t3 where rate is not null on DUPLICATE KEY UPDATE avg_return_7day=rate;"
        #print sql
        self.mysql.insertOne(sql, None)

    def insertScore(self):
        sql_new = "select day,channel,product_id,ifnull(loyalty_users_rate,0),ifnull(retain,0),ifnull(avg_return_7day,0),ifnull(net_rate,0),ifnull(imsi_rate,0),ifnull(imei_rate,0),ifnull(hour_rate,0),ifnull(device_rate,0) from active_fraud_analysis where install>50 and day>='" + self.date_7day_ago + "' and day<'" + self.today + "' ;"
        sql_insert = "insert into active_fraud_analysis(day,channel,product_id,score) values(%s,%s,%s,%s) ON DUPLICATE KEY UPDATE score=values(score)"
        raw = self.mysql.getAll(sql_new)
        if not raw:
            sys.exit()
        for row in raw:
            # Skip rows whose retention is empty
            if row[4] == 0.0:
                continue
            value = [row[i] for i in range(3)]
            rate = [float(row[i]) for i in range(3, len(row))]
            score = reduce(lambda x, y: x * y, np.array(rate)) * 10000 / (row[4] / row[5])**3
            if row[1] == 'Organic':
                score = 100.0
            value.extend([score])
            self.mysql.insertOne(sql_insert, value)

    def modifyScore(self):
        sql = "update active_fraud_analysis a, (select t2.day,t2.channel,t2.product_id,score*90/benchmark as score from (select day,product_id,max(score) as benchmark from active_fraud_analysis where install>50 and channel<>'Organic' and score>0 and day>='" + self.date_7day_ago + "' group by day,product_id) t1,(select day,channel,product_id,score from active_fraud_analysis where install>50 and channel<>'Organic' and score>0 and day>='" + self.date_7day_ago + "') t2 where t1.day=t2.day and t1.product_id=t2.product_id) b set a.score=b.score where a.day=b.day and a.product_id=b.product_id and a.channel=b.channel;"
        #print sql
        self.mysql.update(sql)

    def sigmoid(self, X, useStatus):
        """Normalization: map rate data into [0.5, 1]."""
        if useStatus:
            return 1.0 / (1 + np.exp(-(X)))
        else:
            return X

    def makeDF(self, dt, ind=False, col=['key_p', 'key_m', 'cnt']):
        """The data arrives as a list of tuples, e.g. [('a', 1), ('b', 2)]."""
        if ind:
            indexs = [i[0:2] for i in dt]
            data = [i[2:] for i in dt]
            df = pd.DataFrame(data, index=indexs, columns=col)
        else:
            df = pd.DataFrame(dt, columns=col)
        return df

    def analysisIP_IMSI(self):
        """Analyze IP/IMSI consistency and fetch the raw data."""
        raw = self.redshift.getAll(self.sql_channel_ip_imsi)
        result = map(self.judgeIP_IMSI, raw)
        a = [i for i in result if i[1] == 0]
        b = [i for i in result if i[1] == 1]
        dfa = self.makeDF(a, False, ['key', 'cnt_IPIMSI'])
        c = dfa.groupby('key').count()
        #dfb = self.makeDF(b, False, ['key','cntIPIMSI1'])
        #d = dfb.groupby('key').count()
        df = pd.concat([c], axis=1)
        return df

    def judgeIP_IMSI(self, l):
        """Check the IMSI MCC/MNC whitelist and verify its consistency with the IP country."""
        media_source, country_iso, city, carrier, ip, imsi = l
        if not imsi or len(imsi) < 15:
            return (media_source, 0)
        try:
            response = self.reader.city(ip)  # English names
            ipiso = response.country.iso_code  # e.g. 'TW'
            mcc = imsi[0:3]
            if self.cf.has_option('DB', mcc):
                value = self.cf.get('DB', mcc)
                countryList = []
                mncList = value.split(',')[1:]
                if imsi[3:5] in mncList or imsi[3:6] in mncList:
                    countryList = value.split(',')[0].split('#')
                if country_iso == ipiso and ipiso in countryList:
                    return (media_source, 1)
        except Exception as err:
            print err
            return (media_source, 0)
        return (media_source, 0)

    def buildComplexDataset(self, x):
        # Augment x with 1/x as a second feature
        temp = 1.0 / x
        X = np.hstack((x, temp))
        return X

    def confidenceIntervalEstimation(self, l):
        """Given the data under evaluation, estimate its distance from Organic and compute its credibility."""
        media_source, n, u0, u, tag, rate = l
        if n < 50:
            #print [media_source, None]
            return [media_source, 0.01]
        if tag and u <= u0:
            return [media_source, 1.0]
        if u0 == u:
            return [media_source, 1.0]
        sigm = (u0 * (1 - u0))**rate
        se = sigm / (n**rate)
        x = abs(u - u0) / (se + random.random() / 10)
        X_predict = np.array([x])
        X_predict = X_predict.reshape(X_predict.shape[0], 1)
        X_predict = self.buildComplexDataset(X_predict)
        pred = self.model.predict(X_predict)[0]
        result = min(1, max(0.01, 1 - pred))
        #print [media_source, result]
        return [media_source, result]

    def analysisHour(self):
        """Count activations per channel per hour. Using Organic as the baseline, take the share of its three busiest hours, compute each channel's share over those same hours, and gauge channel authenticity with a 95% confidence interval."""
        raw = self.redshift.getAll(self.sql_channel_hour_cnt)
        # Sort and pick the three busiest hours
        if not raw:
            sys.exit(1)
        product_set = set([i[0] for i in raw])
        result = pd.DataFrame()
        for p in product_set:
            rawOrganic = [i for i in raw if i[1] == 'Organic' and i[0] == p]
            rawOrganic.sort(key=lambda x: -x[3])
            organicMost = [rawOrganic[i][2] for i in range(min(3, len(rawOrganic)))]
            # Aggregate the totals and the top-3 buckets separately
            raw3most = [(i[0], i[1], i[3]) for i in raw if i[2] in organicMost and i[0] == p]
            #rawAll = [(i[1],i[3]) for i in raw]
            df3most = self.makeDF(raw3most, False, ['key_p', 'key_m', 'cnt_hour'])
            df3mostSum = df3most.groupby(['key_p', 'key_m']).sum()
            #print df3mostSum
            #print "df3mostSum"
            #dfAll = self.makeDF(rawAll, False, ['key2', 'cnt_hour_all'])
            #dfAllSum = dfAll.groupby('key2').sum()
            result = pd.concat([result, df3mostSum], axis=0)
        return result

    def analysisDevice(self):
        """Analyze the device distribution."""
        raw = self.redshift.getAll(self.sql_channel_device_cnt)
        if not raw:
            sys.exit(1)
        product_set = set([i[0] for i in raw])
        result = pd.DataFrame()
        for p in product_set:
            rawOrganic = [i for i in raw if i[1] == 'Organic' and i[0] == p]
            rawOrganic.sort(key=lambda x: -x[3])
            organicMost = [rawOrganic[i][2] for i in range(min(3, len(rawOrganic)))]
            # Aggregate the totals and the top-3 buckets separately
            raw3most = [(i[0], i[1], i[3]) for i in raw if i[2] in organicMost and i[0] == p]
            #rawAll = [(i[1],i[3]) for i in raw]
            df3most = self.makeDF(raw3most, False, ['key_p', 'key_m', 'cnt_device'])
            df3mostSum = df3most.groupby(['key_p', 'key_m']).sum()
            #print df3mostSum
            #print "df3mostSum"
            #dfAll = self.makeDF(rawAll, False, ['key2', 'cnt_device_all'])
            #dfAllSum = dfAll.groupby('key2').sum()
            result = pd.concat([result, df3mostSum], axis=0)
        return result

    def analysisModel(self, sql, index, col):
        #print sql
        raw = self.redshift.getAll(sql)
        df = self.makeDF(raw, index, col)
        return df

    def makeConfidenceIntervalEstimation(self, df, col, tag=False, rate=0.3):
        """tag selects one-sided vs. centered testing: if True (one-sided) and the tested rate is below the Organic rate, the credibility is 1. Defaults to False."""
        dfData = df.fillna(0).reset_index().values
        u0 = [i[2] * 1.0 / i[1] for i in dfData if i[0] == 'Organic'][0]
        dfList = [[i[0], i[1], u0, i[2] * 1.0 / i[1], tag, rate] for i in dfData]
        result = map(self.confidenceIntervalEstimation, dfList)
        dfResult = self.makeDF(result, True, col)
        return dfResult

    def rateEstimation(self, l):
        product, media_source, n, u0, u, tag, rate = l
        if u0 == 0:
            u0 = 0.01
        if u >= u0:
            if tag:
                return (product, media_source, 1.0)
            else:
                return (product, media_source, max((2 * u0 - u) / (u0), 0.1))
        else:
            return (product, media_source, u / (u0))

    def makeConfidenceRate(self, df, col, tag=False, rate=0.3):
        """tag selects one-sided vs. centered testing: if True (one-sided) and the tested rate is below the Organic rate, the credibility is 1. Defaults to False."""
        dfData = df.fillna(0).reset_index().values
        product_set = set([i[0] for i in dfData])
        pdf = pd.DataFrame()
        for p in product_set:
            u0 = [i[3] * 1.0 / i[2] for i in dfData if i[1] == 'Organic' and i[0] == p][0]
            dfList = [[i[0], i[1], i[2], u0, i[3] * 1.0 / i[2], tag, rate] for i in dfData if i[0] == p]
            result = map(self.rateEstimation, dfList)
            dfResult = self.makeDF(result, True, col)
            pdf = pd.concat([pdf, dfResult], axis=0)
        return pdf

    def saveToMysql(self, df):
        data = df.fillna(0).reset_index().values
        print data
        sql = "insert into active_fraud_analysis (product_id,channel,day,install,cnt_ip,ip_rate,cnt_net,net_rate,cnt_imsi,imsi_rate,cnt_imei,imei_rate,cnt_hour,hour_rate,cnt_device,device_rate) values(%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s) ON DUPLICATE KEY UPDATE install=values(install),cnt_ip=values(cnt_ip),ip_rate=values(ip_rate),cnt_net=values(cnt_net),net_rate=values(net_rate),cnt_imsi=values(cnt_imsi),imsi_rate=values(imsi_rate),cnt_imei=values(cnt_imei),imei_rate=values(imei_rate),cnt_hour=values(cnt_hour),hour_rate=values(hour_rate),cnt_device=values(cnt_device),device_rate=values(device_rate)"
        #print data
        if len(data[0]) == 16:
            values = [(i[0], i[1], i[2], i[3], i[4], i[5], i[6], i[7], i[8], i[9], i[10], i[11], i[12], i[13], i[14], i[15]) for i in data]
        else:
            values = [(i[0][0], i[0][1], i[1], i[2], i[3], i[4], i[5], i[6], i[7], i[8], i[9], i[10], i[11], i[12], i[13], i[14]) for i in data]
        self.mysql.insertMany(sql, values)

    def analysis(self):
        print 'BEGIN'
        rawData = self.analysisModel(self.sql_channel_date, False, ['key_p', 'key_m', 'datetime'])  # date
        print 'rawData'
        rawData = rawData.groupby(['key_p', 'key_m']).max()
        rawChannel = self.analysisModel(self.sql_channel_cnt, False, ['key_p', 'key_m', 'channel'])  # totals
        print 'rawChannel'
        rawChannel = rawChannel.groupby(['key_p', 'key_m']).max()
        rawIP = self.analysisModel(self.sql_channel_ip_cnt, False, ['key_p', 'key_m', 'cnt_IP'])  # duplicate-IP distribution
        rawIP = rawIP.groupby(['key_p', 'key_m']).max()
        print 'rawIP'
        rawNetwork = self.analysisModel(self.sql_channel_wifi_cnt, False, ['key_p', 'key_m', 'cnt_Net'])  # network distribution
        rawNetwork = rawNetwork.groupby(['key_p', 'key_m']).max()
        print 'rawNetwork'
        #rawIPseg = self.analysisModel(self.sql_channel_IPseg_cnt, True, ['cnt_IPseg'])  # IP-segment distribution
        rawImsi = self.analysisModel(self.sql_channel_imsi_cnt, False, ['key_p', 'key_m', 'cnt_IMSI'])  # empty-IMSI rate
        print 'rawImsi'
        rawImsi = rawImsi.groupby(['key_p', 'key_m']).max()
        rawImei = self.analysisModel(self.sql_channel_imei_cnt, False, ['key_p', 'key_m', 'cnt_IMEI'])  # empty-IMEI rate
        rawImei = rawImei.groupby(['key_p', 'key_m']).max()
        rawHour = self.analysisHour()  # hour-of-day distribution
        rawDevice = self.analysisDevice()  # device distribution
        device = pd.concat([rawChannel, rawDevice], axis=1)
        #dfDevice = self.makeConfidenceIntervalEstimation(device, ['device_rate'], rate=0.5)
        dfDevice = self.makeConfidenceRate(device, ['device_rate'], rate=0.5)
        ip = pd.concat([rawChannel, rawIP], axis=1)
        dfIP = self.makeConfidenceRate(ip, ['ip_rate'], True)
        netWork = pd.concat([rawChannel, rawNetwork], axis=1)
        dfNetwork = self.makeConfidenceRate(netWork, ['network_rate'], rate=0.3)
        #IPseg = pd.concat([rawChannel, rawIPseg], axis=1)
        #dfIPseg = self.makeConfidenceIntervalEstimation(IPseg, ['ipseg_rate'])
        hour = pd.concat([rawChannel, rawHour], axis=1)
        #dfHour = self.makeConfidenceIntervalEstimation(hour, ['hour_rate'], rate=0.5)
        dfHour = self.makeConfidenceRate(hour, ['hour_rate'], rate=0.5)
        #IpImsi = pd.concat([rawChannel, rawIpImsi], axis=1)
        #dfIpImsi = self.makeConfidenceIntervalEstimation(IpImsi, ['IpImsi_rate'])
        imsi = pd.concat([rawChannel, rawImsi], axis=1)
        dfImsi = self.makeConfidenceRate(imsi, ['imsi_rate'], rate=0.3)
        imei = pd.concat([rawChannel, rawImei], axis=1)
        dfImei = self.makeConfidenceRate(imei, ['imei_rate'], rate=0.3)
        result = pd.concat([rawData, rawChannel, rawIP, dfIP, rawNetwork, dfNetwork, rawImsi, dfImsi, rawImei, dfImei, rawHour, dfHour, rawDevice, dfDevice], axis=1)
        print result
        print 'result'
        self.saveToMysql(result)
        self.insertRetain()
        self.insertAvgReturn7day()
        self.insertLoyaltyUser()
        self.insertScore()
        self.modifyScore()
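# A minimal driver sketch for the multi-product class above (illustration only;
# the begin/end arguments follow the convention of __init__, defaulting to
# yesterday/today when omitted):
#
#     if __name__ == '__main__':
#         af = ActiveFraud()                              # yesterday .. today
#         # af = ActiveFraud('2017-07-01', '2017-07-02')  # or an explicit window
#         af.analysis()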
class ActiveFraud(object):
    """Active-install fraud analysis for a single product (ptid)."""

    def __init__(self, ptid=None, begin=None, end=None):
        # Load the configuration files
        self.reader = gd.Reader('GeoIP2-City.mmdb')
        self.cf = ConfigParser.ConfigParser()
        self.cf.read('imsiConfig2.conf')
        self.redshift = RedShift()
        self.mysql = Mysql()
        # Initialize the begin/end dates and the product id
        self.today = end if end else datetime.date.today().strftime("%Y-%m-%d")
        self.yesterday = begin if begin else (datetime.date.today() - datetime.timedelta(days=1)).strftime("%Y-%m-%d")
        self.before_yesterday = (datetime.datetime.strptime(str(self.yesterday), "%Y-%m-%d") + datetime.timedelta(days=-1)).strftime("%Y-%m-%d")
        self.ptid = ptid if ptid else '600025'
        print self.today
        print self.yesterday
        print self.ptid
        self.day_yesterday = self.yesterday.replace('-', '')  # e.g. 20100101
        self.day_before_yesterday = self.getPlusDay(self.yesterday, -1)  # e.g. 20100101
        self.day_7day_ago = self.getPlusDay(self.yesterday, -7)  # e.g. 20100101
        # Build the data query SQL
        self.sql_channel_cnt = "select media_source,count(distinct token) from view_appsflyer_postback_android_install_token where install_time>='" + self.yesterday + "' and install_time<'" + self.today + "' and product_id=" + self.ptid + " group by media_source;"
        self.sql_channel_ip_cnt = "select media_source,sum(cnt) from (select media_source,ip,count(distinct token) as cnt from view_appsflyer_postback_android_install_token where install_time>='" + self.yesterday + "' and install_time<'" + self.today + "' and product_id=" + self.ptid + " group by media_source,ip having count(distinct token)>=5) group by media_source;"
        self.sql_channel_ip_imsi = "select media_source,country_iso,city,carrier,ip,split_part(imsi,'_',2) as imsi from view_appsflyer_postback_android_install_token where install_time>='" + self.yesterday + "' and install_time<'" + self.today + "' and product_id=" + self.ptid + ";"
        self.sql_channel_imsi_cnt = "select media_source,sum(case when lenimsi=15 then 1 else 0 end) as cnt from (select distinct media_source,token,len(split_part(imsi,'_',2)) as lenimsi from view_appsflyer_postback_android_install_token where install_time>='" + self.yesterday + "' and install_time<'" + self.today + "' and product_id=" + self.ptid + ") t group by media_source;"
        self.sql_channel_imei_cnt = "select media_source,sum(case when lenimei=15 then 1 else 0 end) as cnt from (select distinct media_source,token,len(imei) as lenimei from view_appsflyer_postback_android_install_token where install_time>='" + self.yesterday + "' and install_time<'" + self.today + "' and product_id=" + self.ptid + ") t group by media_source;"
        self.sql_channel_hour_cnt = "select media_source,date_part(hour, install_time),count(distinct token) from view_appsflyer_postback_android_install_token where install_time>='" + self.yesterday + "' and install_time<'" + self.today + "' and product_id=" + self.ptid + " group by media_source,date_part(hour, install_time);"
        self.sql_channel_wifi_cnt = "select media_source,sum(case when wifi='t' then cnt else 0 end) as cnt from (select media_source,wifi,count(distinct token) as cnt from view_appsflyer_postback_android_install_token where install_time>='" + self.yesterday + "' and install_time<'" + self.today + "' and product_id=" + self.ptid + " group by media_source,wifi) t group by media_source;"
        self.sql_channel_ver_cnt = "select media_source,ver,count(distinct token) from view_appsflyer_postback_android_install_token where install_time>='" + self.yesterday + "' and install_time<'" + self.today + "' and product_id=" + self.ptid + " group by media_source,ver;"
        self.sql_channel_device_cnt = "select media_source,device_brand,count(distinct token) from view_appsflyer_postback_android_install_token where install_time>='" + self.yesterday + "' and install_time<'" + self.today + "' and product_id=" + self.ptid + " group by media_source,device_brand;"
        self.sql_channel_IPseg_cnt = "select media_source,sum(cnt) from (select media_source,ip,count(token) as cnt from (select distinct media_source,token,split_part(ip,'.',1)||'.'||split_part(ip,'.',2)||'.'||split_part(ip,'.',3) as ip from view_appsflyer_postback_android_install_token where install_time>='" + self.yesterday + "' and install_time<'" + self.today + "' and product_id=" + self.ptid + ") t group by media_source,ip having count(token)>=5) t2 group by media_source;"
        self.sql_channel_date = "select distinct media_source,'" + self.yesterday + "','" + self.ptid + "' from view_appsflyer_postback_android_install_token where install_time>='" + self.yesterday + "' and install_time<'" + self.today + "' and product_id=" + self.ptid + ";"
        # Build the t-distribution prediction model
        y = np.array([0.50, 0.60, 0.70, 0.80, 0.90, 0.95, 0.98])
        x = np.array([0.674, 0.842, 1.036, 1.282, 1.645, 1.960, 2.326])
        x = x.reshape(x.shape[0], 1)
        X = self.buildComplexDataset(x)
        alpha = 0.001
        l1_ratio = 0.001
        enet = ElasticNet(alpha=alpha, l1_ratio=l1_ratio)
        self.model = enet.fit(X, y)

    def getPlusDay(self, day, plus):
        date = datetime.datetime.strptime(str(day), "%Y-%m-%d") + datetime.timedelta(days=plus)
        string = date.strftime("%Y%m%d")
        return string

    def insertLoyaltyUser(self):
        if self.ptid == '600025':
            loyalty_key = '600025_T05'
        elif self.ptid == '600020':
            loyalty_key = 'T01'
        elif self.ptid == '600022':
            loyalty_key = 'P01'
        else:
            return 0
        sql_loyalty = "select day,media_source,product_id,sum(case when key_cnt>=3 then 1 else 0 end) as cnt,sum(case when key_cnt>=3 then 1 else 0 end)*1.0/count(*) as rate from (select distinct trunc(install_time) as day,media_source,product_id,token from view_appsflyer_postback_android_install_token where install_time>='" + self.yesterday + "' and install_time<'" + self.today + "' and product_id=" + self.ptid + ") t1 left outer join (select token,count(*) as cnt,count(case when sta_key='" + loyalty_key + "' then 1 else null end) as key_cnt from sta_event_game_publish where server_time>='" + self.yesterday + "' and server_time<'" + self.today + "' and product_id=" + self.ptid + " group by token) t2 on t1.token=t2.token group by day,media_source,product_id"
        sql_insert = "insert into active_fraud_analysis(day,channel,product_id,loyalty_users,loyalty_users_rate) values(%s,%s,%s,%s,%s) ON DUPLICATE KEY UPDATE loyalty_users=values(loyalty_users),loyalty_users_rate=values(loyalty_users_rate)"
        raw = self.redshift.getAll(sql_loyalty)
        print sql_loyalty
        self.mysql.insertMany(sql_insert, raw)

    def insertRetain(self):
        """Insert the day-1 retention data."""
        sql = "INSERT INTO active_fraud_analysis (day,channel,product_id,retain) select * from (select str_to_date(t1.date, '%Y%m%d') as day,t1.pid,t1.pdtid,(case when t2.cnt is null then 0 else t2.cnt end)*1.0/t1.cnt as rate from (select date,pid,pdtid,sum(match_num) cnt from ROI_install where pdtid=" + self.ptid + " and date='" + self.day_before_yesterday + "' group by pid,pdtid) t1 left outer join (select date,pid,pdtid,retain_day,sum(spec_retain) cnt from ROI_Retained where pdtid=" + self.ptid + " and date='" + self.day_before_yesterday + "' and retain_day='" + self.day_yesterday + "' group by pid) t2 on t1.date=t2.date and t1.pid=t2.pid) t on DUPLICATE KEY UPDATE retain=rate;"
        #print sql
        self.mysql.insertOne(sql, None)

    def insertAvgReturn7day(self):
        sql = "INSERT INTO active_fraud_analysis (day,channel,product_id,avg_return_7day) select * from (select str_to_date(t1.date, '%Y%m%d') as day,t1.pid,t1.pdtid,t2.cnt/(datediff(str_to_date('" + self.day_yesterday + "','%Y%m%d'),str_to_date(t1.date, '%Y%m%d'))*t1.cnt) rate from (select date,pid,pdtid,sum(match_num) cnt from ROI_install where pdtid=" + self.ptid + " and date>='" + self.day_7day_ago + "' and date<'" + self.day_yesterday + "' group by pid,pdtid,date) t1 left outer join (select date,pid,pdtid,sum(cnt) as cnt from (select date,pid,pdtid,retain_day,sum(spec_retain) cnt from ROI_Retained where pdtid=" + self.ptid + " and date>='" + self.day_7day_ago + "' and retain_day>date and retain_day<='" + self.day_yesterday + "' group by pid,retain_day,date) t group by date,pid,pdtid) t2 on t1.date=t2.date and t1.pid=t2.pid) t3 on DUPLICATE KEY UPDATE avg_return_7day=rate;"
        self.mysql.insertOne(sql, None)

    def insertScore(self):
        #sql = "insert into active_fraud_analysis(day,channel,product_id,score) select * from (select day,channel,product_id,log(loyalty_users_rate*100+1.0)*log(retain*100+1.0)*log(avg_return_7day*100+1.0)*log(ip_rate*net_rate*100+1.0)*log(imsi_rate*100+1.0)*log(imei_rate*100+1.0)*log(hour_rate*100+1.0)*log(device_rate*100+1.001) as score from active_fraud_analysis where day>='2017-02-18' and day<'2017-02-23') t ON DUPLICATE KEY UPDATE score=t.score;"
        #print sql
        #self.mysql.insertOne(sql,None)
        sql_new = "select day,channel,product_id,ifnull(loyalty_users_rate,0),ifnull(retain,0),ifnull(avg_return_7day,0),ifnull(net_rate,0),ifnull(imsi_rate,0),ifnull(imei_rate,0),ifnull(hour_rate,0),ifnull(device_rate,0) from active_fraud_analysis where day>='" + self.before_yesterday + "' and day<'" + self.today + "'"
        sql_insert = "insert into active_fraud_analysis(day,channel,product_id,score) values(%s,%s,%s,%s) ON DUPLICATE KEY UPDATE score=values(score)"
        raw = self.mysql.getAll(sql_new)
        for row in raw:
            print row[3:6]
            value = [row[i] for i in range(3)]
            rate = [row[i] for i in range(6, len(row))]
            scoreList = self.sigmoid(np.array(rate), True)
            score = reduce(lambda x, y: x * y, scoreList) * 2**(len(row) - 3) / 15 * 100 * row[3] * row[3] * row[4] * row[5]
            value.extend([score])
            self.mysql.insertOne(sql_insert, value)
        return raw

    def sigmoid(self, X, useStatus):
        """Normalization: map rate data into [0.5, 1]."""
        if useStatus:
            return 1.0 / (1 + np.exp(-(X)))
        else:
            return X

    def makeDF(self, dt, ind=False, col=['key', 'cnt']):
        """The data arrives as a list of tuples, e.g. [('a', 1), ('b', 2)]."""
        if ind:
            indexs = [i[0] for i in dt]
            data = [i[1:] for i in dt]
            df = pd.DataFrame(data, index=indexs, columns=col)
        else:
            df = pd.DataFrame(dt, columns=col)
        return df

    def analysisIP_IMSI(self):
        """Analyze IP/IMSI consistency and fetch the raw data."""
        raw = self.redshift.getAll(self.sql_channel_ip_imsi)
        result = map(self.judgeIP_IMSI, raw)
        a = [i for i in result if i[1] == 0]
        b = [i for i in result if i[1] == 1]
        dfa = self.makeDF(a, False, ['key', 'cnt_IPIMSI'])
        c = dfa.groupby('key').count()
        #dfb = self.makeDF(b, False, ['key','cntIPIMSI1'])
        #d = dfb.groupby('key').count()
        df = pd.concat([c], axis=1)
        return df

    def judgeIP_IMSI(self, l):
        """Check the IMSI MCC/MNC whitelist and verify its consistency with the IP country."""
        media_source, country_iso, city, carrier, ip, imsi = l
        if not imsi or len(imsi) < 15:
            return (media_source, 0)
        try:
            response = self.reader.city(ip)  # English names
            ipiso = response.country.iso_code  # e.g. 'TW'
            mcc = imsi[0:3]
            if self.cf.has_option('DB', mcc):
                value = self.cf.get('DB', mcc)
                countryList = []
                mncList = value.split(',')[1:]
                if imsi[3:5] in mncList or imsi[3:6] in mncList:
                    countryList = value.split(',')[0].split('#')
                if country_iso == ipiso and ipiso in countryList:
                    return (media_source, 1)
        except Exception as err:
            print err
            return (media_source, 0)
        return (media_source, 0)

    def buildComplexDataset(self, x):
        # Augment x with 1/x as a second feature
        temp = 1.0 / x
        X = np.hstack((x, temp))
        return X

    def confidenceIntervalEstimation(self, l):
        """Given the data under evaluation, estimate its distance from Organic and compute its credibility."""
        media_source, n, u0, u, tag, rate = l
        if n < 50:
            #print [media_source, None]
            return [media_source, 0.01]
        if tag and u <= u0:
            return [media_source, 1.0]
        if u0 == u:
            return [media_source, 1.0]
        sigm = (u0 * (1 - u0))**rate
        se = sigm / (n**rate)
        x = abs(u - u0) / (se + random.random() / 10)
        #print x
        X_predict = np.array([x])
        X_predict = X_predict.reshape(X_predict.shape[0], 1)
        X_predict = self.buildComplexDataset(X_predict)
        #print X_predict
        pred = self.model.predict(X_predict)[0]
        result = min(1, max(0.01, 1 - pred))
        #print [media_source, result]
        return [media_source, result]

    def analysisHour(self):
        """Count activations per channel per hour. Using Organic as the baseline, take the share of its three busiest hours, compute each channel's share over those same hours, and gauge channel authenticity with a 95% confidence interval."""
        raw = self.redshift.getAll(self.sql_channel_hour_cnt)
        # Sort and pick the three busiest hours
        #print raw
        #print self.sql_channel_hour_cnt
        rawOrganic = [i for i in raw if i[0] == 'Organic']
        rawOrganic.sort(key=lambda x: -x[2])
        organicMost = [rawOrganic[i][1] for i in range(min(3, len(rawOrganic)))]
        # Aggregate the totals and the top-3 buckets separately
        raw3most = [(i[0], i[2]) for i in raw if i[1] in organicMost]
        rawAll = [(i[0], i[2]) for i in raw]
        df3most = self.makeDF(raw3most, False, ['key1', 'cnt_hour'])
        df3mostSum = df3most.groupby('key1').sum()
        #dfAll = self.makeDF(rawAll, False, ['key2', 'cnt_hour_all'])
        #dfAllSum = dfAll.groupby('key2').sum()
        # Build the combined dataframe
        result = pd.concat([df3mostSum], axis=1)
        return result

    def analysisDevice(self):
        """Analyze the device distribution."""
        raw = self.redshift.getAll(self.sql_channel_device_cnt)
        rawOrganic = [i for i in raw if i[0] == 'Organic']
        rawOrganic.sort(key=lambda x: -x[2])
        organicMost = [rawOrganic[i][1] for i in range(min(3, len(rawOrganic)))]
        # Aggregate the totals and the top-3 buckets separately
        raw3most = [(i[0], i[2]) for i in raw if i[1] in organicMost]
        rawAll = [(i[0], i[2]) for i in raw]
        df3most = self.makeDF(raw3most, False, ['key1', 'cnt_device'])
        df3mostSum = df3most.groupby('key1').sum()
        #dfAll = self.makeDF(rawAll, False, ['key2', 'cnt_device_all'])
        #dfAllSum = dfAll.groupby('key2').sum()
        # Build the combined dataframe
        result = pd.concat([df3mostSum], axis=1)
        return result

    def analysisModel(self, sql, index, col):
        #print sql
        raw = self.redshift.getAll(sql)
        df = self.makeDF(raw, index, col)
        return df

    def makeConfidenceIntervalEstimation(self, df, col, tag=False, rate=0.3):
        """tag selects one-sided vs. centered testing: if True (one-sided) and the tested rate is below the Organic rate, the credibility is 1. Defaults to False."""
        dfData = df.fillna(0).reset_index().values
        u0 = [i[2] * 1.0 / i[1] for i in dfData if i[0] == 'Organic'][0]
        dfList = [[i[0], i[1], u0, i[2] * 1.0 / i[1], tag, rate] for i in dfData]
        result = map(self.confidenceIntervalEstimation, dfList)
        dfResult = self.makeDF(result, True, col)
        return dfResult

    def saveToMysql(self, df):
        data = df.fillna(0).reset_index().values
        sql = "insert into active_fraud_analysis(channel,day,product_id,install,cnt_ip,ip_rate,cnt_net,net_rate,cnt_imsi,imsi_rate,cnt_imei,imei_rate,cnt_hour,hour_rate,cnt_device,device_rate) values(%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s) ON DUPLICATE KEY UPDATE install=values(install),cnt_ip=values(cnt_ip),ip_rate=values(ip_rate),cnt_net=values(cnt_net),net_rate=values(net_rate),cnt_imsi=values(cnt_imsi),imsi_rate=values(imsi_rate),cnt_imei=values(cnt_imei),imei_rate=values(imei_rate),cnt_hour=values(cnt_hour),hour_rate=values(hour_rate),cnt_device=values(cnt_device),device_rate=values(device_rate)"
        #print data
        values = [(i[0], i[1], i[2], i[3], i[4], i[5], i[6], i[7], i[8], i[9], i[10], i[11], i[12], i[13], i[14], i[15]) for i in data]
        print "values"
        print values
        self.mysql.insertMany(sql, values)
        #from sqlalchemy import create_engine
        ## Write the dataframe into MySQL; this needs a sqlalchemy.create_engine connection with charset utf8, otherwise some Latin characters cannot be handled
        #yconnect = create_engine('mysql+mysqldb://datauser:ebOUeWSin3hAAKKD@test-dataverse-web.c0poh9vgjxya.rds.cn-north-1.amazonaws.com.cn:3306/dataverse?charset=utf8')
        #pd.io.sql.to_sql(df, 'zhx_test', yconnect, flavor='mysql', schema='dataverse', if_exists='append')
        #pd.io.sql.to_sql(rawChannel,'test_zhx', yconnect, flavor='mysql', schema='dataverse', if_exists='append')

    def analysis(self):
        print 'BEGIN'
        rawData = self.analysisModel(self.sql_channel_date, True, ['datetime', 'product_id'])  # date
        rawChannel = self.analysisModel(self.sql_channel_cnt, True, ['channel'])  # totals
        rawIP = self.analysisModel(self.sql_channel_ip_cnt, True, ['cnt_IP'])  # duplicate-IP distribution
        rawNetwork = self.analysisModel(self.sql_channel_wifi_cnt, True, ['cnt_Net'])  # network distribution
        #rawIPseg = self.analysisModel(self.sql_channel_IPseg_cnt, True, ['cnt_IPseg'])  # IP-segment distribution
        rawImsi = self.analysisModel(self.sql_channel_imsi_cnt, True, ['cnt_IMSI'])  # empty-IMSI rate
        rawImei = self.analysisModel(self.sql_channel_imei_cnt, True, ['cnt_IMEI'])  # empty-IMEI rate
        #rawIpImsi = self.analysisIP_IMSI()  # abnormal IP/IMSI distribution
        #print rawIpImsi
        rawHour = self.analysisHour()  # hour-of-day distribution
        rawDevice = self.analysisDevice()  # device distribution
        device = pd.concat([rawChannel, rawDevice], axis=1)
        dfDevice = self.makeConfidenceIntervalEstimation(device, ['device_rate'], rate=0.5)
        ip = pd.concat([rawChannel, rawIP], axis=1)
        dfIP = self.makeConfidenceIntervalEstimation(ip, ['ip_rate'], True)
        netWork = pd.concat([rawChannel, rawNetwork], axis=1)
        dfNetwork = self.makeConfidenceIntervalEstimation(netWork, ['network_rate'], rate=0.3)
        #IPseg = pd.concat([rawChannel, rawIPseg], axis=1)
        #dfIPseg = self.makeConfidenceIntervalEstimation(IPseg, ['ipseg_rate'])
        hour = pd.concat([rawChannel, rawHour], axis=1)
        dfHour = self.makeConfidenceIntervalEstimation(hour, ['hour_rate'], rate=0.5)
        #IpImsi = pd.concat([rawChannel, rawIpImsi], axis=1)
        #dfIpImsi = self.makeConfidenceIntervalEstimation(IpImsi, ['IpImsi_rate'])
        imsi = pd.concat([rawChannel, rawImsi], axis=1)
        dfImsi = self.makeConfidenceIntervalEstimation(imsi, ['imsi_rate'], rate=0.3)
        imei = pd.concat([rawChannel, rawImei], axis=1)
        dfImei = self.makeConfidenceIntervalEstimation(imei, ['imei_rate'], rate=0.3)
        result = pd.concat([rawData, rawChannel, rawIP, dfIP, rawNetwork, dfNetwork, rawImsi, dfImsi, rawImei, dfImei, rawHour, dfHour, rawDevice, dfDevice], axis=1)
        self.saveToMysql(result)
        self.insertRetain()
        self.insertAvgReturn7day()
        self.insertLoyaltyUser()
        print result
        # new test
        result = self.insertScore()
        return result
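# The single-product variant takes a ptid; a sketch of sweeping several products
# (illustration only; the product ids are the ones handled by insertLoyaltyUser
# above):
#
#     if __name__ == '__main__':
#         for ptid in ('600020', '600022', '600025'):
#             ActiveFraud(ptid=ptid).analysis()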
# In[1]:

import numpy as np
import pandas as pd
import os
import time
import datetime
from redshiftpool import RedShift
from mysqlpool import Mysql

# In[3]:

redshift = RedShift()
#mysql = Mysql()

# In[4]:

#sql_train_neg = "select c.token,to_char(client_time,'YYYY-MM-DD HH24:MI:SS'),c.sta_key,to_char(create_time,'YYYY-MM-DD HH24:MI:SS') from (select b.* from (select token from sta_new_user where server_time>='2017-06-01' and server_time<'2017-06-06' and product_id=600027) a ,(select * from (select token,client_time,sta_key,ROW_NUMBER () OVER (PARTITION BY token order by client_time asc) as row from sta_event_game_publish where server_time>='2017-06-01' and server_time<'2017-06-11' and product_id=600027 and substring(sta_key,1,1)='T') t where row<=30) b where a.token=b.token) c left join (select token,min(create_time) as create_time from game_iap where create_time>='2017-06-01' and product_id=600027 group by token)d on c.token=d.token where d.token is null;"
sql_train_neg = "select c.token,to_char(client_time,'YYYY-MM-DD HH24:MI:SS'),c.sta_key,to_char(create_time,'YYYY-MM-DD HH24:MI:SS') from (select b.* from (select token from sta_new_user where server_time>='2017-07-01' and server_time<'2017-07-06' and product_id=600027) a ,(select * from (select token,client_time,sta_key,ROW_NUMBER () OVER (PARTITION BY token order by client_time asc) as row from sta_event_game_publish where server_time>='2017-07-01' and server_time<'2017-07-11' and product_id=600027 and substring(sta_key,1,1)='T') t where row<=30) b where a.token=b.token) c left join (select token,min(create_time) as create_time from game_iap where create_time>='2017-07-01' and product_id=600027 group by token)d on c.token=d.token where d.token is null;"
sql_train_pos = "select c.token,to_char(client_time,'YYYY-MM-DD HH24:MI:SS'),c.sta_key,to_char(create_time,'YYYY-MM-DD HH24:MI:SS') from (select b.* from (select token from sta_new_user where server_time>='2017-03-01' and product_id=600027) a ,(select * from (select token,client_time,sta_key,ROW_NUMBER () OVER (PARTITION BY token order by client_time asc) as row from sta_event_game_publish where server_time>='2017-03-01' and product_id=600027 and substring(sta_key,1,1)='T') t where row<=30) b where a.token=b.token) c join (select token,min(create_time) as create_time from game_iap where create_time>='2017-04-01' and product_id=600027 group by token)d on c.token=d.token;"

# In[5]:
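# A plausible next step (assumption -- the original cell content is missing):
# pull both cohorts and label them for training, 0 for non-payers and 1 for
# payers. The column names mirror the select list of the two queries above.
cols = ['token', 'client_time', 'sta_key', 'create_time']
df_neg = pd.DataFrame(redshift.getAll(sql_train_neg), columns=cols)
df_pos = pd.DataFrame(redshift.getAll(sql_train_pos), columns=cols)
df_neg['label'] = 0
df_pos['label'] = 1
train = pd.concat([df_neg, df_pos], ignore_index=True)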