def export_data():
    """Export monthly activity stats (stat_HARDCODE-1024).

    For every signup-month cohort and every tracked month (Mar-Oct 2012),
    write one CSV row per user: read/write activity flags, friend count,
    and the number of feed posts produced by the user's friends in that month.
    """
    DAU_LOG = {}
    populate_DAU(datetime.datetime(2012, 2, 20), datetime.datetime(2012, 11, 1), DAU_LOG)
    writer = CommonCsvWriter('stat_HARDCODE-1024')
    writer.write_header([u'注册月份', u'追踪月份', u'has_read', u'has_write', u'friends count', 'feed'])
    for signup_month in range(1, 13):
        cohort = get_target_users_by_month(signup_month)
        for tracked_month in range(3, 11):
            window_start = datetime.datetime(2012, tracked_month, 1)
            window_end = datetime.datetime(2012, tracked_month + 1, 1)
            ts_start = stat_util.convert_datetime_to_timestamp(window_start)
            ts_end = stat_util.convert_datetime_to_timestamp(window_end)
            # last local day of the month (UTC+8 offset baked in)
            month_last_day = window_end - datetime.timedelta(days=1, hours=8)
            for member in cohort:
                uid = member.id
                read_flag = 'YES' if is_readactivity(uid, window_start, window_end, DAU_LOG) else 'NO'
                write_flag = 'YES' if is_writeactivity(uid, window_start, window_end) else 'NO'
                friends = get_friends_and_counts(uid, month_last_day)
                # total feed posts (types 1,2,3,5,7) produced by friends this month
                seen_feed = sum(
                    crab.user_post[fid].find(
                        R.type.in_([1, 2, 3, 5, 7])
                        & (R.created_on >= ts_start)
                        & (R.created_on < ts_end)
                    ).count()
                    for fid in friends
                )
                writer.write_onerow([signup_month, tracked_month, read_flag,
                                     write_flag, len(friends), seen_feed])
    writer.end_write()
def export_csv(aid_data):
    """Export per-ad, per-POI daily counters (stat_HARDCORE-1005).

    `aid_data` maps ad id -> list of daily dicts; each daily dict holds a
    'date_str' plus guid -> (view, click, share) entries.
    """
    writer = CommonCsvWriter(filename='./output/stat_HARDCORE-1005')
    writer.write_header(['date', 'aid', 'guid', 'guid_name', 'chechin No', 'view', 'click', 'share'])
    rows = []
    for aid in sorted(aid_data.keys()):
        for daily in aid_data.get(aid):
            date = daily['date_str']
            for guid in sorted(daily.keys()):
                if guid == 'date_str':
                    continue  # metadata key, not a POI
                checkin_total = get_post_count_by_guid(guid)
                loc_name = get_location(guid)
                counters = daily.get(guid)
                rows.append([date, aid, guid, loc_name, checkin_total,
                             counters[0], counters[1], counters[2]])
    writer.write_body(rows)
    writer.end_write()
def export_csv(uids):
    """Export one CSV row per check-in (Sep 30 - Oct 30 2012, local days)
    for every user in `uids`: user info, location, city, post body, photo
    link, and Sina-weibo identity (stat_HARDCORE-983).

    Fixes:
    - `city_get(...) or 'N/A'` followed by `city['name']` raised TypeError
      whenever city_get returned a falsy value (string indexed with a str
      key); the 'N/A' fallback is now applied after the name lookup.
    - Removed an unreachable copy-paste tail that referenced an undefined
      `aid_data` (NameError after the main loop); its accumulated rows were
      never written anyway (write_body was commented out).
    """
    csw = CommonCsvWriter(filename='./output/stat_HARDCORE-983')
    # NOTE(review): header labels do not match the row layout below; kept
    # byte-identical in case a downstream consumer depends on them.
    csw.write_header([u'aid', 'guid', 'guid_name', 'chechin No', 'view', 'click', 'share'])
    # UTC window shifted -8h / +16h to cover local (UTC+8) calendar days.
    start_date = datetime.datetime(2012, 9, 30) - datetime.timedelta(hours=8)
    end_date = datetime.datetime(2012, 10, 30) + datetime.timedelta(hours=16)
    start = stat_util.convert_datetime_to_timestamp(start_date)
    end = stat_util.convert_datetime_to_timestamp(end_date)
    for uid in uids:
        sina = utils.get_weibo_info(uid)
        friends_count = get_f_count(uid)
        user = user_get(uid)
        user_name = user['name']
        user_point = user['points_total']
        check_in_count = 0
        for p in crab.user_post[uid].find(
                (R.created_on >= start) & (R.created_on < end) & (R.location_id > 0)):
            created_on = p['created_on']
            if not check_in_count:
                # computed lazily, once per user, only if the user has posts
                check_in_count = crab.user_post[uid].find(
                    (R.created_on >= start) & (R.created_on < end) & (R.location_id > 0)).count()
            checkin_date = datetime.datetime.fromtimestamp(created_on)
            post = db_slave.post.find_one({'_id': p['post_id']})
            loc = location_get(post['l'], 'basic')
            loc_name = loc['name']
            city = city_get(p['city'])
            city = city['name'] if city else 'N/A'
            body = post['b']
            photo = db.photo.find_one({'p': p['post_id']})
            has_photo = 'YES' if photo else 'NO'
            photo_link = get_photo_url(photo) + '?size=500&style=1&quality=high' if photo else 'N/A'
            sina_weibo = 'YES' if sina else 'N/A'
            sina_name = sina['screen_name'] if sina else 'N/A'
            sina_url = 'http://weibo.com/u/%d' % sina['id'] if sina else 'N/A'
            csw.write_onerow([user_name, uid, user_point, checkin_date, loc_name,
                              check_in_count, friends_count, city, body, has_photo,
                              photo_link, sina_weibo, sina_name, sina_url])
    csw.end_write()
def export_csv(guids):
    """Split each 'guid:city:poi' string on ':' and emit it as one CSV row
    (stat_HARDCORE-1023_new)."""
    writer = CommonCsvWriter(filename='./output/stat_HARDCORE-1023_new')
    writer.write_header([u'guid', 'city', 'poi'])
    for entry in guids:
        writer.write_onerow(entry.split(':'))
    writer.end_write()
def stat_guid_lists(target_guids): #start_date = datetime.datetime.utcnow() - datetime.timedelta(days = 100) #stat_timestamp = stat_util.convert_datetime_to_timestamp(start_date) #user_ids = set() guid_list = [] csw = CommonCsvWriter(filename='./output/stat_nail_beauty') csw.write_header(['guid','checkin_user_no']) for guid in target_guids: csw.write_onerow(guid) print guid csw.end_write() return guid_list
def retrive_guids(before = datetime.datetime.utcnow() - datetime.timedelta(days = 100)): before = datetime.datetime.utcnow() - datetime.timedelta(days = 100) before = datetime.datetime(*before.timetuple()[:3]) #end = get_id('locations', 'created_on', before) guids = set() #pre_sql = "select l.guid FROM locations l limit %s,%s;" #param = ('',str(1000)) #print end #sql = 'select guid from locations l where l.guid < ' offset = 0 limit = 50000 flag = True retrieve_count = 0 csw = CommonCsvWriter(filename='./output/stat_nail_beauty') csw.write_header([u'found from','guid','poi_name','tip_id',u'tip_content']) while retrieve_count < 237442: for guid,name in session.query(Location.guid,Location.name).offset(offset).limit(limit): if '指甲' in name or '美甲' in name or 'nail' in name or 'nail beauty' in name: guids.add((guid,name)) csw.write_onerow(('poi name',guid,name)) print 'hited by poi name',guid,name for tip in crab.location_post[guid_to_int(guid)].find(R.type==2): pid = tip['post_id'] post = db_slave.post.find_one({'_id':pid}) if post and 'b' in post: body = post['b'] body = body.replace('\r').replace('\n') if '指甲' in body or '美甲' in body or 'nail' in body or 'nail beauty' in body: guids.add((guid,name)) csw.write_onerow(('tip',guid,name,pid,body)) print 'hited by tip',guid,name retrieve_count += 1 print len(guids) offset += limit csw.end_write() return guids
def get_target_guids():
    """Collect guids of popular POIs (>=100 check-ins) in five major cities
    for categories 0403/0401/0103, logging each hit to CSV.

    Returns a sorted list of 'guid:city:name' strings.
    """
    tracked_cities = [u'北京', u'上海', u'广州', u'深圳', u'成都']
    found = set()
    writer = CommonCsvWriter(filename='./output/stat_HARDCORE-1023_new')
    writer.write_header([u'guid', 'city', 'poi', 'checkin times'])
    category_filter = {'cat.id': {'$in': ['0403', '0401', '0103']}}
    for record in db_slave.locations_categories_2.find(category_filter):
        guid = record['_id']
        location = location_get(guid)
        if not (location and 'city' in location and location['city'] in tracked_cities):
            continue
        check_in_count = crab.location_post[guid_to_int(guid)].find().count()
        if check_in_count < 100:
            continue
        found.add(str(guid) + ':' + location['city'] + ':' + location['name'])
        writer.write_onerow((guid, location['city'], location['name'], check_in_count))
    writer.end_write()
    return sorted(found)
def export_csv(guids):
    """Dump every post of the given POIs — user, location, created time,
    flattened body, photo flag and story URL — to ./output/stat_HM."""
    writer = CommonCsvWriter(filename='./output/stat_HM')
    writer.write_header(['date', 'aid', 'guid', 'guid_name', 'chechin No', 'view', 'click', 'share'])
    rows = []
    for guid in guids:
        for post in get_posts_by_guid(guid, detail=True):
            flat_body = post['b'].replace('\r', '').replace('\n', '')
            has_photo = 'Yes' if db_slave.photo.find_one({'p': post['_id']}) else 'No'
            story_url = 'http://jiepang.com/user/story?pid=%s' % str(post['_id'])
            rows.append([post['u'], post['l'], post['c'], flat_body, has_photo, story_url])
    writer.write_body(rows)
    writer.end_write()
def stat_guid_lists(target_guids): #start_date = datetime.datetime.utcnow() - datetime.timedelta(days = 100) #stat_timestamp = stat_util.convert_datetime_to_timestamp(start_date) #user_ids = set() guid_list = [] csw = CommonCsvWriter(filename='./output/stat_checkin_lte5') csw.write_header(['guid','checkin_user_no']) for guid in target_guids: int_guid = guid_to_int(guid) count = crab.location_post[int_guid].find().group(R.user_id).count() if count < 5: guid_list.append((guid,count)) csw.write_onerow((guid,count)) print guid csw.end_write() return guid_list
def export_csv(guids): csw = CommonCsvWriter(filename='./output/stat_HARDCORE-989') csw.write_header([u'aid','guid','guid_name','chechin No','view','click','share']) csv_body = [] for guid in guids: checkin_count = crab.location_post[guid_to_int(guid)].find(R.type.in_([1,3,7,10])).count() user_count = crab.location_post[guid_to_int(guid)].find(R.type.in_([1,3,7,10])).group(R.user_id).count() photo_count = crab.location_post[guid_to_int(guid)].find(R.type.in_([1,3,7,10]) & R.has_photo).count() #pid = crab.location_post[int(guid,16)].find(R.type.in_([1,3,7,10]))[0]['post_id'] location = location_get(guid) if location: row = [location['name'],guid,checkin_count,user_count,photo_count,location['city']] print row csw.write_onerow(row) csw.end_write()
def export_csv(guids):
    # Two-part export (stat_HARDCORE-990):
    #   Part 1: per-POI lifetime vs windowed (Jun 14 - Nov 14 2012) counts.
    #   Part 2: per-day totals across all POIs over the same date range.
    csw = CommonCsvWriter(filename="./output/stat_HARDCORE-990-1")
    # NOTE(review): header labels don't match the row layout; left untouched.
    csw.write_header([u"aid", "guid", "guid_name", "chechin No", "view", "click", "share"])
    csv_body = []
    # UTC window shifted -8h / +16h to align with local (UTC+8) calendar days.
    start_time = datetime.datetime(2012, 6, 14) - datetime.timedelta(hours=8)
    start = stat_util.convert_datetime_to_timestamp(start_time)
    end_time = datetime.datetime(2012, 11, 14) + datetime.timedelta(hours=16)
    end = stat_util.convert_datetime_to_timestamp(end_time)
    for guid in guids:
        # Underscore-prefixed counts are lifetime totals (no time filter).
        _checkin_count = crab.location_post[guid_to_int(guid)].find(R.type.in_([1, 3, 7, 10])).count()
        _user_count = crab.location_post[guid_to_int(guid)].find(R.type.in_([1, 3, 7, 10])).group(R.user_id).count()
        _photo_count = crab.location_post[guid_to_int(guid)].find((R.has_photo) & R.type.in_([1, 3, 7, 10])).count()
        # Windowed counterparts, restricted to [start, end).
        checkin_count = (
            crab.location_post[guid_to_int(guid)]
            .find(R.type.in_([1, 3, 7, 10]) & (R.created_on >= start) & (R.created_on < end))
            .count()
        )
        user_count = (
            crab.location_post[guid_to_int(guid)]
            .find(R.type.in_([1, 3, 7, 10]) & (R.created_on >= start) & (R.created_on < end))
            .group(R.user_id)
            .count()
        )
        photo_count = (
            crab.location_post[guid_to_int(guid)]
            .find(R.type.in_([1, 3, 7, 10]) & (R.has_photo) & (R.created_on >= start) & (R.created_on < end))
            .count()
        )
        # pid = crab.location_post[int(guid,16)].find(R.type.in_([1,3,7,10]))[0]['post_id']
        location = location_get(guid)
        if location:
            # row = [guid,location['name'],checkin_count,user_count,photo_count,' ',_checkin_count,_user_count,_photo_count]
            row = [
                guid,
                location["name"],
                _checkin_count,
                _user_count,
                _photo_count,
                " ",
                checkin_count,
                user_count,
                photo_count,
            ]
            print row
            csw.write_onerow(row)
    csw.end_write()
    # Part 2: one row per calendar day, counts summed across all guids.
    csw = CommonCsvWriter(filename="./output/stat_HARDCORE-990-2")
    csw.write_header([u"aid", "guid", "guid_name", "chechin No", "view", "click", "share"])
    csv_body = []
    start_date = datetime.datetime(2012, 6, 14)
    end_date = datetime.datetime(2012, 11, 14)
    for i in range(0, (end_date - start_date).days + 1):
        _day = start_date + datetime.timedelta(days=i)
        # 24h local-day window for _day, expressed in UTC (-8h .. +16h).
        start_time = _day - datetime.timedelta(hours=8)
        end_time = _day + datetime.timedelta(hours=16)
        start = stat_util.convert_datetime_to_timestamp(start_time)
        end = stat_util.convert_datetime_to_timestamp(end_time)
        checkin_count = 0
        user_count = 0
        photo_count = 0
        for guid in guids:
            checkin_count += (
                crab.location_post[guid_to_int(guid)]
                .find(R.type.in_([1, 3, 7, 10]) & (R.created_on >= start) & (R.created_on < end))
                .count()
            )
            user_count += (
                crab.location_post[guid_to_int(guid)]
                .find(R.type.in_([1, 3, 7, 10]) & (R.created_on >= start) & (R.created_on < end))
                .group(R.user_id)
                .count()
            )
            photo_count += (
                crab.location_post[guid_to_int(guid)]
                .find(R.type.in_([1, 3, 7, 10]) & (R.has_photo) & (R.created_on >= start) & (R.created_on < end))
                .count()
            )
        row = [_day, checkin_count, user_count, photo_count]
        print row
        csw.write_onerow(row)
    csw.end_write()
def get_target_guids():
    """Read the target guid list from column 2 of the input CSV."""
    return stat_util.get_vertical_list_from_csv(INPUT_FILE_PATH, 2)


def stat_user_lists(target_guids):
    """Return the set of user_ids with a check-in at any target POI in the
    last 60 days."""
    window_start = datetime.datetime.utcnow() - datetime.timedelta(days=60)
    since = stat_util.convert_datetime_to_timestamp(window_start)
    users = set()
    for guid in target_guids:
        matched = crab.location_post[guid_to_int(guid)].find(R.created_on > since).group(R.user_id)
        for record in matched:
            users.add(record["user_id"])
    return users


if __name__ == "__main__":
    target_guids = get_target_guids()
    user_ids = stat_user_lists(target_guids)
    csw = CommonCsvWriter("subway_userlist")
    csw.write_header([u"user_id"])
    csw.write_body([[uid] for uid in user_ids])
    csw.end_write()
def export_csv(type = 'api'):
    # Merge profiling / client / length / img-server logs and export two CSVs:
    # one joining api-server profiling with client timings, one joining
    # img-server timings with client timings. Returns `result_data`, whose
    # shape depends on `export_type` (currently hard-coded to 'csv' -> list).
    #ProfilingLog
    #ClientLog
    test_pro = './input/looper_logs/api_server/profiling.log'
    test_client = './input/looper_logs/client/all_client.log'
    test_length = './input/looper_logs/api_length/length.log'
    test_img = './input/looper_logs/img_server/profiling.log'
    export_type = 'csv'
    pdatas = ProfilingLog.read_rows(test_pro)
    cdatas = ClientLog.read_rows(test_client)
    # result: request-id -> {client_log, profiling_log, length_log}
    result = ProfilingLog.compare_to_client_log(pdatas,cdatas[1])
    lengthlog = LengthLog.read_rows(test_length)
    LengthLog.append_length_to_data(lengthlog,result)
    imglog = ImgLog.read_rows(test_img)
    img_result = ImgLog.compare_to_client_log(imglog, cdatas[1])
    if export_type == 'json':
        result_data = {'items':[]}
    else:
        result_data = []
    csw = CommonCsvWriter('compare_result_apiserver_with_client.csv')
    csw.write_header(['aa'])
    csw_img = CommonCsvWriter('compare_result_apiserver_with_client_img.csv')
    csw_img.write_header(['aa'])
    # apiserver loop: one CSV row per profiling entry of each request
    for key in result:
        data = {}
        if type == 'api':
            data['req_id'] = key
            #row.append(key)
        log = result.get(key)
        req_id = log['client_log'][ClientLog.REQ_ID]
        req_type = log['client_log'][ClientLog.TYPE]
        is_wifi = u'yes' if log['client_log'][ClientLog.IS_WIFI] else 'no'
        client_spend_time = log['client_log'][ClientLog.RES_TIME]
        #print (len(log['profiling_log']))
        #print log['profiling_log']
        for profiling_log in log['profiling_log']:
            row = []
            req_api = profiling_log[ProfilingLog.REQ_API]
            api_server_received_time = profiling_log[ProfilingLog.REQ_TIME]
            api_server_req_ip = profiling_log[ProfilingLog.REQ_IP]
            api_server_spd_time = profiling_log[ProfilingLog.SPD_TIME]
            api_server_req_uid = profiling_log[ProfilingLog.RED_UID]
            #print profiling_log
            api_server_version = profiling_log[ProfilingLog.REQ_VER]
            api_server_response_length = log['length_log'][LengthLog.LENGTH]
            row.append(req_id)
            row.append(req_type)
            row.append(is_wifi)
            row.append(client_spend_time)
            row.append(req_api)
            row.append(api_server_received_time)
            row.append(api_server_req_ip)
            row.append(api_server_spd_time)
            row.append(api_server_req_uid)
            row.append(api_server_version)
            row.append(api_server_response_length)
            csw.write_onerow(row)
        # The two triple-quoted blocks below are disabled (no-op) leftovers:
        # an earlier in-loop img export and a json/csv result accumulator.
        """
        imglogs = img_result.get(key)
        if not imglogs:
            continue
        req_id = imglogs['client_log'][ClientLog.REQ_ID]
        req_type = imglogs['client_log'][ClientLog.TYPE]
        is_wifi = u'yes' if imglogs['client_log'][ClientLog.IS_WIFI] else 'no'
        client_spend_time = log['client_log'][ClientLog.RES_TIME]
        for img_log in imglogs['img_log']:
            img_spend_time = img_log[ImgLog.SPD_TIME]
            img_length = img_log[ImgLog.LENGTH]
            row.append(req_id)
            row.append(req_type)
            row.append(is_wifi)
            row.append(client_spend_time)
            row.append(img_spend_time)
            row.append(img_length)
            csw_img.write_onerow(row)
        """
        """
        if export_type == 'json':
            data['profiling_log'] = log['profiling_log']
            data['client_log'] = log['client_log']
            data['length_log'] = log['length_log']
            result_data['items'].append(data)
        else:
            data['length'] = log['length_log'][LengthLog.LENGTH]
            result_data.append(data)
        """
    # img-server loop: one CSV row per img log entry of each request
    for key in img_result:
        imglogs = img_result.get(key)
        if not imglogs:
            continue
        req_id = imglogs['client_log'][ClientLog.REQ_ID]
        if not req_id:
            continue
        req_type = imglogs['client_log'][ClientLog.TYPE]
        is_wifi = u'yes' if int(imglogs['client_log'][ClientLog.IS_WIFI]) else 'no'
        client_spend_time = imglogs['client_log'][ClientLog.RES_TIME]
        for img_log in imglogs['img_log']:
            img_type = img_log[ImgLog.TYPE]
            img_spend_time = img_log[ImgLog.SPD_TIME]
            img_length = img_log[ImgLog.LENGTH]
            row = []
            row.append(req_id)
            row.append(req_type)
            row.append(is_wifi)
            row.append(client_spend_time)
            row.append(img_spend_time)
            row.append(img_type)
            row.append(img_length)
            csw_img.write_onerow(row)
    csw.end_write()
    csw_img.end_write()
    return result_data