Esempio n. 1
0
def export_data():
    """Export per-user monthly activity rows to the 'stat_HARDCODE-1024' CSV.

    For every registration month 1..12 the target users are fetched once,
    then for each tracked month March..October 2012 one row is written per
    user: registration month, tracked month, read flag, write flag, number
    of friends, and the number of posts (types 1, 2, 3, 5, 7) that the
    user's friends created inside the tracked month.
    """
    dau_log = {}
    # presumably fills dau_log for the given date range; it is passed on to
    # is_readactivity below -- verify against populate_DAU
    populate_DAU(datetime.datetime(2012, 2, 20), datetime.datetime(2012, 11, 1), dau_log)

    writer = CommonCsvWriter('stat_HARDCODE-1024')
    # Header columns: registration month, tracked month, has_read, has_write,
    # friends count, feed.
    writer.write_header([u'注册月份',u'追踪月份',u'has_read',u'has_write',u'friends count','feed'])

    for signup_month in range(1, 13):
        target_users = get_target_users_by_month(signup_month)
        for tracked_month in range(3, 11):
            period_start = datetime.datetime(2012, tracked_month, 1)
            period_end = datetime.datetime(2012, tracked_month + 1, 1)
            start_ts = stat_util.convert_datetime_to_timestamp(period_start)
            end_ts = stat_util.convert_datetime_to_timestamp(period_end)
            # Reference moment for the friend list: first day of the next
            # month minus 1 day and 8 hours.
            friends_as_of = period_end - datetime.timedelta(days=1, hours=8)

            for user in target_users:
                uid = user.id
                if is_readactivity(uid, period_start, period_end, dau_log):
                    read_flag = 'YES'
                else:
                    read_flag = 'NO'
                if is_writeactivity(uid, period_start, period_end):
                    write_flag = 'YES'
                else:
                    write_flag = 'NO'

                friends = get_friends_and_counts(uid, friends_as_of)
                # Total number of matching posts across all friends.
                seen_feed = sum(
                    crab.user_post[friend].find(
                        R.type.in_([1, 2, 3, 5, 7])
                        & (R.created_on >= start_ts)
                        & (R.created_on < end_ts)
                    ).count()
                    for friend in friends
                )
                writer.write_onerow(
                    [signup_month, tracked_month, read_flag, write_flag, len(friends), seen_feed]
                )
    writer.end_write()
Esempio n. 2
0
def export_csv(uids):
    """Write one CSV row per located post of each user in ``uids``.

    Posts are taken from ``crab.user_post[uid]`` where ``location_id > 0``
    and ``created_on`` falls in the window
    [2012-09-30 minus 8h, 2012-10-30 plus 16h).  Each row carries the user's
    name, id and points, the check-in time, location name, the user's total
    check-in count in the window, friend count, city name, post body, photo
    information and Sina Weibo information.

    :param uids: iterable of user ids to export.
    """
    csw = CommonCsvWriter(filename='./output/stat_HARDCORE-983')
    # NOTE(review): this header matches the rows collected into ``csv_body``
    # at the end of the function (which are never written), not the
    # 14-column rows written per check-in below -- confirm the intended header.
    csw.write_header([u'aid','guid','guid_name','chechin No','view','click','share'])
    csv_body = []

    start_date = datetime.datetime(2012,9,30) - datetime.timedelta(hours = 8)
    end_date = datetime.datetime(2012,10,30) + datetime.timedelta(hours = 16)
    start = stat_util.convert_datetime_to_timestamp(start_date)
    end = stat_util.convert_datetime_to_timestamp(end_date)
    for uid in uids:
        sina = utils.get_weibo_info(uid)
        friends_count = get_f_count(uid)
        user = user_get(uid)
        user_name = user['name']
        user_point = user['points_total']
        check_in_count = 0
        for p in crab.user_post[uid].find( (R.created_on >=start) & (R.created_on < end) & (R.location_id > 0)):
            created_on = p['created_on']
            if not check_in_count:
                # Computed lazily so users without any matching post cost no
                # extra count query.
                check_in_count = crab.user_post[uid].find( (R.created_on >=start) & (R.created_on < end) & (R.location_id > 0)).count()
            checkin_date = datetime.datetime.fromtimestamp(created_on)
            post = db_slave.post.find_one({'_id':p['post_id']})
            if not post:
                # The mongo document is missing; skip the row instead of
                # failing on post['l'] (same handling as get_user_posts).
                continue
            loc = location_get(post['l'], 'basic')
            loc_name = loc['name']
            # Fix: the original replaced a missing city with the string
            # 'N/A' and then indexed that string with ['name'], raising
            # TypeError whenever the city lookup returned nothing.
            city = city_get(p['city'])
            city_name = city['name'] if city else 'N/A'

            body = post['b']
            photo = db.photo.find_one({'p':p['post_id']})
            has_photo = 'YES' if photo else 'NO'
            photo_link = get_photo_url(photo) + '?size=500&style=1&quality=high' if photo else 'N/A'
            sina_weibo = 'YES' if sina else 'N/A'
            sina_name = sina['screen_name'] if sina else 'N/A'
            sina_url = 'http://weibo.com/u/%d' % sina['id'] if sina else 'N/A'
            csw.write_onerow([user_name,uid,user_point,checkin_date,loc_name,check_in_count,friends_count,city_name,body,has_photo,photo_link ,
                             sina_weibo,sina_name ,sina_url])

    # NOTE(review): ``aid_data`` is not defined in this function (it must be
    # a module-level name), and ``csv_body`` is collected but never written
    # because the write_body call is disabled -- confirm whether this
    # section is still needed.
    for aid in sorted(aid_data.keys()):
        datas = aid_data.get(aid)
        for data in datas:
            for guid in sorted(data.keys()):
                checkin_c = get_post_count_by_guid(guid)
                loc_name = get_location(guid)
                num = data.get(guid)
                csv_body.append([aid,guid,loc_name,checkin_c,num[0],num[1],num[2]])

    #csw.write_body(csv_body)
    csw.end_write()
Esempio n. 3
0
def get_post_count_by_guid(guid):
    end = stat_util.convert_datetime_to_timestamp(datetime.datetime(2012,11,1) - datetime.timedelta(hours = 8))
    start = stat_util.convert_datetime_to_timestamp(datetime.datetime(2012,9,30) - datetime.timedelta(hours = 8))
    
    r = crab.location_post[guid_to_int(guid)].find((R.type.in_([1,3,7,10])) & (R.created_on > start)& (R.created_on < end)).count()
    #r += crab.location_post[guid_to_int(guid)].find( R.type==7& R.created_on > start& R.created_on < end).count()
    #r += crab.location_post[guid_to_int(guid)].find( R.type==10& R.created_on > start& R.created_on < end).count()
    print 'get_post_count_by_guid',guid,r
    return r
Esempio n. 4
0
def get_user_posts(uid,start_date,end_date,all_posts):
    """Pick up to three posts per calendar month for one user.

    Rows are read from ``crab.user_post[uid]`` (privacy != 1, with a
    location, optionally bounded by ``start_date`` / ``end_date``
    inclusive).  The mongo document of each row is taken from ``all_posts``
    when present there, otherwise from ``db_slave.post``.  Rows without a
    document, at a virtual location, or flagged ``del`` are skipped.

    Each kept document is annotated in place with ``total_feedback``
    (``nc`` + ``nl``) and ``has_photo`` and bucketed by the month of its
    ``c`` value shifted forward 8 hours.  Per month the three documents
    ranking highest on (has_photo, total_feedback) are kept and returned in
    ascending ``c`` order.

    :param uid: user id used to index ``crab.user_post``.
    :param start_date: optional lower bound; falsy disables it.
    :param end_date: optional upper bound; falsy disables it.
    :param all_posts: optional mapping of post id to mongo document; its
        documents are mutated when used.
    :return: list of 12 lists (index 0 = January).
    """
    # Not "visible to owner only", and must have a location.
    conditions = [R.privacy != 1, R.location_id]
    if start_date:
        conditions.append(
            R.created_on >= stat_util.convert_datetime_to_timestamp(start_date))
    if end_date:
        conditions.append(
            R.created_on <= stat_util.convert_datetime_to_timestamp(end_date))

    monthly = [[] for _ in range(12)]
    print(crab.user_post[uid].find(*conditions).count())
    for row in crab.user_post[uid].find(*conditions):
        pid = row['post_id']
        cached = all_posts.get(pid) if all_posts else None
        # Fall back to the database when the cache has no usable document.
        doc = cached or db_slave.post.find_one({'_id': pid})
        if not doc:
            continue
        if __is_virtual_loc(doc.get('l')):
            continue

        # NOTE(review): a stored value of -1 is what counts as "has photo"
        # here -- confirm against the crab schema.
        photo_flag = 1 if row['has_photo'] == -1 else 0
        if doc.get('del'):
            continue

        local_created = doc.get('c') + datetime.timedelta(hours=8)
        doc['total_feedback'] = (doc.get('nc') or 0) + (doc.get('nl') or 0)
        doc['has_photo'] = photo_flag
        monthly[local_created.month - 1].append(doc)

    for idx, bucket in enumerate(monthly):
        # Best three first (descending), then chronological order.
        best = sorted(bucket, key=itemgetter('has_photo', 'total_feedback'), reverse=True)[:3]
        monthly[idx] = sorted(best, key=itemgetter('c'))

    return monthly
Esempio n. 5
0
def stat_user_lists(target_guids):
    """Collect ids of users with posts at the given locations in the last 60 days.

    The cutoff is ``utcnow()`` minus 60 days; for each guid the
    ``crab.location_post`` rows created after the cutoff are grouped by
    ``user_id`` and the ids are merged into one set.

    :param target_guids: iterable of location guids.
    :return: set of ``user_id`` values.
    """
    cutoff = datetime.datetime.utcnow() - datetime.timedelta(days=60)
    cutoff_ts = stat_util.convert_datetime_to_timestamp(cutoff)

    user_ids = set()
    for guid in target_guids:
        grouped = (
            crab.location_post[guid_to_int(guid)]
            .find(R.created_on > cutoff_ts)
            .group(R.user_id)
        )
        user_ids.update(row["user_id"] for row in grouped)
    return user_ids
Esempio n. 6
0
def is_writeactivity(uid,start_month,end_month):
    """Return True when the user has at least one post in [start, end).

    Both bounds are shifted back by 8 hours before being converted to
    timestamps.

    :param uid: user id used to index ``crab.user_post``.
    :param start_month: datetime marking the inclusive start of the period.
    :param end_month: datetime marking the exclusive end of the period.
    :return: bool.
    """
    tz_shift = datetime.timedelta(hours=8)
    start = stat_util.convert_datetime_to_timestamp(start_month - tz_shift)
    end = stat_util.convert_datetime_to_timestamp(end_month - tz_shift)
    in_period = (R.created_on >= start) & (R.created_on < end)
    # Fix: the original called .count() on the condition expression inside
    # find(...) and then compared the query object itself with 0; the count
    # has to be taken from the query result.
    return crab.user_post[uid].find(in_period).count() > 0
Esempio n. 7
0
def create_cate_data_monthly():
    
    start_month = 12
    end_month = 12
    guid_cate = fetch_all_loc_cate()
    user_cate = {}
    exists_count = 0
    cursor = db_slave.stat_journal_2012.find({},{'_id':1,'mc':1},timeout = False)
    try:
        for r in cursor:
            if 'mc' not in r:
                continue
	    old_mc = r['mc']
            exists_count += 1 
            uid = r['_id']
            user_cate[uid] = [[]for r in range(0,12)]
            for month in range(start_month,end_month + 1):
                start_date = datetime.datetime(2012,month,1) -  datetime.timedelta(hours = 8)
                end_date = start_date + datetime.timedelta(days = 31)
                if end_date.month != month:
                    end_date = start_date + datetime.timedelta(days = 30)
                if end_date.month != month:
                    end_date = start_date + datetime.timedelta(days = 29)
                
                start_timestamp = stat_util.convert_datetime_to_timestamp(start_date)
                end_timestamp = stat_util.convert_datetime_to_timestamp(end_date)
                cate_count_dict = {}
                for p in crab.user_post[uid].find( (R.privacy != 1) & R.location_id & (R.created_on >= start_timestamp) & (R.created_on <= end_timestamp)).group(R.location_id):
                    loc_id = p['location_id']
                    #pid = p['post_id']
                    #mongo_post = db_slave.post.find_one({'_id':pid}) or {}
                    #guid = mongo_post.get('l')
                    guid ='%x' % loc_id 
                    if not guid:
                        continue
                    short_guid = guid[:14]
                    short_guid = short_guid.upper()
                    #print short_guid
                    #categories = db_slave.locations_categories_2.find_one({'_id':short_guid}) or {}
                    #cate = categories.get('cat')[0] if 'cat' in categories and len(categories['cat']) > 0 else ''
                    cate = guid_cate.get(short_guid)
                    if not cate:
                        continue
                    cate_id = cate.get('id') or None
                    if not cate_id:
                        continue
                    
                    if cate_id not in cate_count_dict:
                        cate_count_dict[cate_id] = 0
                    cate_count_dict[cate_id] = cate_count_dict[cate_id] + 1
                frequently_cate = []
                if cate_count_dict:
                #find 3 gone frequently categories
                    while True:
                        if len(frequently_cate) > 2 or len(frequently_cate) > len(cate_count_dict):
                            break
                        max_cate ,max_count = max(cate_count_dict.iteritems(),key = lambda cate:cate[1])
                        del cate_count_dict[max_cate]
                        #frequently_cate[max_cate] = max_count
                        frequently_cate.append({'cid':max_cate,'c':max_count})
                #print cate_count_dict
                
                user_cate[uid][month - 1] = frequently_cate
                    
                #max_locations = []
                #while True:
                #    if len(max_locations) >= 3
            
            print uid
	    if user_cate[uid][11]:
		
		if type(old_mc) != list:
		    db.stat_journal_2012.update({'_id': uid}, {'$set': {'mc':user_cate[uid]}})
		else:
            	    db.stat_journal_2012.update({'_id': uid}, {'$set': {'mc.11':user_cate[uid][11]}})
    finally:
        print 'newly appended user count is ',exists_count
        cursor.close()
Esempio n. 8
0
def _location_post_counts(guid, start=None, end=None):
    """Return (checkin_count, user_count, photo_count) for one location.

    Counts ``crab.location_post`` rows of types 1, 3, 7 and 10; when both
    ``start`` and ``end`` are given the rows are limited to
    ``start <= created_on < end``.  ``user_count`` is the count of the query
    grouped by ``user_id``; ``photo_count`` adds the ``has_photo`` condition.
    """
    int_guid = guid_to_int(guid)
    cond = R.type.in_([1, 3, 7, 10])
    if start is not None and end is not None:
        cond = cond & (R.created_on >= start) & (R.created_on < end)
    checkin_count = crab.location_post[int_guid].find(cond).count()
    user_count = crab.location_post[int_guid].find(cond).group(R.user_id).count()
    photo_count = crab.location_post[int_guid].find(cond & (R.has_photo)).count()
    return checkin_count, user_count, photo_count


def export_csv(guids):
    """Export check-in statistics for ``guids`` into two CSV files.

    ``stat_HARDCORE-990-1`` gets one row per location: guid, name, all-time
    check-in/user/photo counts, a blank separator column, then the same
    three counts for the window [2012-06-14 minus 8h, 2012-11-14 plus 16h).

    ``stat_HARDCORE-990-2`` gets one row per day from 2012-06-14 through
    2012-11-14 with the check-in/user/photo counts summed over all guids
    for that day (day minus 8h up to day plus 16h).

    :param guids: location guids; iterated once for the first file and once
        per day for the second.
    """
    csw = CommonCsvWriter(filename="./output/stat_HARDCORE-990-1")
    # NOTE(review): this header has 7 columns but the rows below have 9 with
    # different meanings -- confirm the intended column names.
    csw.write_header([u"aid", "guid", "guid_name", "chechin No", "view", "click", "share"])

    start_time = datetime.datetime(2012, 6, 14) - datetime.timedelta(hours=8)
    start = stat_util.convert_datetime_to_timestamp(start_time)
    end_time = datetime.datetime(2012, 11, 14) + datetime.timedelta(hours=16)
    end = stat_util.convert_datetime_to_timestamp(end_time)
    for guid in guids:
        location = location_get(guid)
        if not location:
            # Fix: the original fell through with ``row`` either unbound
            # (NameError on the first guid) or still holding the previous
            # location's data, which was then written a second time.
            continue
        total_checkins, total_users, total_photos = _location_post_counts(guid)
        checkin_count, user_count, photo_count = _location_post_counts(guid, start, end)
        row = [
            guid,
            location["name"],
            total_checkins,
            total_users,
            total_photos,
            " ",
            checkin_count,
            user_count,
            photo_count,
        ]
        print(row)
        csw.write_onerow(row)
    csw.end_write()

    csw = CommonCsvWriter(filename="./output/stat_HARDCORE-990-2")
    # NOTE(review): same 7-column header, but the rows are
    # [day, checkins, users, photos] -- confirm the intended header.
    csw.write_header([u"aid", "guid", "guid_name", "chechin No", "view", "click", "share"])

    start_date = datetime.datetime(2012, 6, 14)
    end_date = datetime.datetime(2012, 11, 14)
    for i in range(0, (end_date - start_date).days + 1):
        _day = start_date + datetime.timedelta(days=i)
        start = stat_util.convert_datetime_to_timestamp(_day - datetime.timedelta(hours=8))
        end = stat_util.convert_datetime_to_timestamp(_day + datetime.timedelta(hours=16))
        checkin_count = 0
        user_count = 0
        photo_count = 0
        for guid in guids:
            # Users are counted per location and then summed, so a user who
            # posted at two locations on the same day contributes twice.
            day_checkins, day_users, day_photos = _location_post_counts(guid, start, end)
            checkin_count += day_checkins
            user_count += day_users
            photo_count += day_photos
        row = [_day, checkin_count, user_count, photo_count]
        print(row)
        csw.write_onerow(row)
    csw.end_write()