def fetch_uids_from_weibo_cn(scheduler):
    """Crawl follower UIDs from weibo.cn fan pages and persist them in batches.

    Walks up to `page_num` fan pages for `my_uid`, collecting follower UIDs
    from each page, persisting every `persist_thresh` UIDs through
    `Follower.save_uids`, then re-schedules itself on `scheduler` after
    SCHEDULE_INTERVAL seconds.

    :param scheduler: a `sched.scheduler` used to re-enter this task.
    """
    # Params.
    global my_uid
    follower_url = 'http://weibo.cn/{}/fans?rightmod=1&wvr=6'.format(my_uid)
    page_num = 101
    find_next_retry = 3
    uids, persist_thresh = [], 100
    url = follower_url
    for i in xrange(page_num):
        with get_pager() as pager:
            pager.get(url)
            try:
                # Remember the following function also updates the 'st' token
                # which would be used to remove followers.
                uids_in_page = get_follower_uids_in_a_page(pager)
                if uids_in_page:
                    uids.extend(uids_in_page)
                if len(uids) >= persist_thresh:
                    Follower.save_uids(uids)
                    logging.info('Persisted %d uids' % len(uids))
                    uids = []
            except RemoveZombieException as e:
                logging.warn(unicode(e))
            for _ in range(find_next_retry):
                try:
                    links = pager.find_elements_by_link_text(u'下页')
                    if not links:
                        # Fixed typo in the message ("Could'nt" -> "Couldn't").
                        raise Exception('Couldn\'t find next page link')
                    next_page = links[0]
                    # Set next page URL.
                    url = next_page.get_attribute('href')
                except Exception as e:
                    logging.warn(unicode(e))
                    logging.info('Retry after a second')
                    time.sleep(1)
                else:
                    logging.info('Go to next page: %d' % (i + 1))
                    break
            else:
                # All retries failed; give up on the remaining pages.
                msg = 'Failed to find next page for %d times, exit' \
                    % find_next_retry
                logging.warn(msg)
                break
    # Persist remaining list of UIDs.
    if uids:
        Follower.save_uids(uids)
        logging.info('Persisted %d uids' % len(uids))
    # Schedule next task.
    logging.info('Schedule next fetching')
    scheduler.enter(SCHEDULE_INTERVAL, 1, fetch_uids_from_weibo_cn,
                    (scheduler, ))
def fetch_uids_from_weibo_cn(scheduler):
    """Crawl follower UIDs from weibo.cn fan pages and persist them in batches.

    Walks up to `page_num` fan pages for `my_uid`, collecting follower UIDs
    from each page, persisting every `persist_thresh` UIDs through
    `Follower.save_uids`, then re-schedules itself on `scheduler` after
    SCHEDULE_INTERVAL seconds.

    :param scheduler: a `sched.scheduler` used to re-enter this task.
    """
    # Params.
    global my_uid
    follower_url = 'http://weibo.cn/{}/fans?rightmod=1&wvr=6'.format(my_uid)
    page_num = 101
    find_next_retry = 3
    uids, persist_thresh = [], 100
    url = follower_url
    for i in xrange(page_num):
        with get_pager() as pager:
            pager.get(url)
            try:
                # Remember the following function also updates the 'st' token
                # which would be used to remove followers.
                uids_in_page = get_follower_uids_in_a_page(pager)
                if uids_in_page:
                    uids.extend(uids_in_page)
                if len(uids) >= persist_thresh:
                    Follower.save_uids(uids)
                    logging.info('Persisted %d uids' % len(uids))
                    uids = []
            except RemoveZombieException as e:
                logging.warn(unicode(e))
            for _ in range(find_next_retry):
                try:
                    links = pager.find_elements_by_link_text(u'下页')
                    if not links:
                        # Fixed typo in the message ("Could'nt" -> "Couldn't").
                        raise Exception('Couldn\'t find next page link')
                    next_page = links[0]
                    # Set next page URL.
                    url = next_page.get_attribute('href')
                except Exception as e:
                    logging.warn(unicode(e))
                    logging.info('Retry after a second')
                    time.sleep(1)
                else:
                    logging.info('Go to next page: %d' % (i + 1))
                    break
            else:
                # All retries failed; give up on the remaining pages.
                msg = 'Failed to find next page for %d times, exit' \
                    % find_next_retry
                logging.warn(msg)
                break
    # Persist remaining list of UIDs.
    if uids:
        Follower.save_uids(uids)
        logging.info('Persisted %d uids' % len(uids))
    # Schedule next task.
    logging.info('Schedule next fetching')
    scheduler.enter(
        SCHEDULE_INTERVAL, 1, fetch_uids_from_weibo_cn, (scheduler,))
def fetch_follower_info(scheduler):
    """Fill in weibo/follower counts for UIDs that do not have them yet.

    Fetches profile pages concurrently (CONCURRENT_CONN at a time), parses
    weibo count and follower count from each page, persists results in
    batches of `persist_thresh`, then re-schedules itself on `scheduler`.

    :param scheduler: a `sched.scheduler` used to re-enter this task.
    """
    uids = Follower.get_unfilled_uids()
    info = namedtuple('Info', ['uid', 'weibo_count', 'follower_count'])
    if uids:
        follower_info_list, persist_thresh = [], 100
        for i in range(0, len(uids), CONCURRENT_CONN):
            sub_uid_list = uids[i:i + CONCURRENT_CONN]
            concurrent_reqs = [
                grequests.get(info_url(uid), headers=HEADERS)
                for uid in sub_uid_list
            ]
            resp_list = grequests.map(concurrent_reqs)
            try:
                for uid, resp in zip(sub_uid_list, resp_list):
                    # grequests.map yields None for failed requests; skip
                    # them instead of raising AttributeError on resp.text,
                    # which used to abort the whole sub-batch.
                    if resp is None:
                        logging.warn('Failed to get info of user %s' % uid)
                        continue
                    matches = re.findall(
                        u'%s.*%s' % (WEIBO_PATTERN, FOLLOWER_PATTERN),
                        resp.text)
                    if matches:
                        weibo_cnt, follower_cnt = map(int, matches[0])
                        follower_info_list.append(info(
                            uid, weibo_cnt, follower_cnt))
                        logging.info(
                            '%s Weibo Count: %d, Follower Count: %d' % (
                                uid, weibo_cnt, follower_cnt))
                        # '>=' for consistency with the batching threshold
                        # used by the UID fetcher.
                        if len(follower_info_list) >= persist_thresh:
                            Follower.save_follower_info(follower_info_list)
                            logging.info('Persisted %d follower info entries'
                                         % len(follower_info_list))
                            follower_info_list = []
                    else:
                        logging.warn('Failed to get info of user %s' % uid)
            except Exception:
                logging.exception('Exception for retrieving user info')
                continue
        # Persist remaining list of info.
        if follower_info_list:
            Follower.save_follower_info(follower_info_list)
            logging.info('Persisted %d follower info entries'
                         % len(follower_info_list))
    else:
        logging.error('No UIDs provided. Wait for next time')
    # Schedule next task.
    logging.info('Schedule next fetching')
    scheduler.enter(
        SCHEDULE_INTERVAL, 1, fetch_follower_info, (scheduler,))
def add_follower(user, follower):
    """Create a follower relationship unless it already exists.

    :param user: id of the user being followed.
    :param follower: id of the user who follows.
    """
    existing = Follower.query.filter(
        Follower.user_id == user,
        Follower.follower_id == follower).first()
    if existing is not None:
        # Relationship already recorded; nothing to do.
        return
    link = Follower(follower_id=follower, user_id=user)
    db.session.add(link)
    db.session.commit()
def follow_user(user_id):
    """Follow the user identified by `user_id`."""
    # uid: the user making the request
    follower_id = request.form.get('uid')
    if not follower_id:
        abort(400)  # Bad request
    session_id = session_check(session['id'], request.remote_addr,
                               request.user_agent, int(follower_id))
    if session_id == '':
        # Stale/invalid session: clear it and reject the request.
        session['id'] = ''
        abort(401)  # Unauthorized
    session['id'] = session_id
    # Refuse to follow deleted users or users already followed; abort()
    # raises, so reaching the bottom means the follow may proceed.
    target = User.query.filter(User.id == user_id).one()
    if target.deleted:
        abort(403, 'Cannot follow a deleted user.')
    already_following = Follower.query.filter(
        Follower.user_id == user_id,
        Follower.follower_id == follower_id).first()
    if already_following:
        abort(403, 'User already followed.')
    db.session.add(Follower(user_id=user_id, follower_id=follower_id))
    db.session.commit()
    return ('', 204)  # status 204: success, no content
def kill_zombies(scheduler):
    """Remove a batch of zombie followers via weibo.cn and record deletions.

    Issues concurrent remove requests (CONCURRENT_CONN at a time) for up to
    ZOMBIE_KILL_LIMIT zombie UIDs, confirms deletions back to the DB, then
    re-schedules itself on `scheduler`.

    :param scheduler: a `sched.scheduler` used to re-enter this task.
    """
    st = get_st()
    uids = Follower.get_zombie_uids(limit=ZOMBIE_KILL_LIMIT)
    if uids and st:
        logging.info('Try to delete %d zombie followers' % len(uids))
        for i in range(0, len(uids), CONCURRENT_CONN):
            sub_uid_list = uids[i:i + CONCURRENT_CONN]
            concurrent_reqs = [
                grequests.get(
                    remove_url(uid, st),
                    headers=HEADERS,
                    allow_redirects=False)
                for uid in sub_uid_list
            ]
            resp_list = grequests.map(concurrent_reqs)
            deleted_uids = []
            for uid, resp in zip(sub_uid_list, resp_list):
                if not resp:
                    logging.warn('None response, ignore')
                    # BUG FIX: previously fell through and raised
                    # AttributeError on resp.status_code for None responses.
                    continue
                # A 302 redirect is how weibo.cn signals a successful removal.
                if resp.status_code == 302:
                    deleted_uids.append(uid)
                    logging.info('Deleted %s' % uid)
                else:
                    logging.warn('Failed to delete %s because of %s'
                                 % (uid, resp.status_code))
            # Record back to DB about deleted UIDs.
            if deleted_uids:
                Follower.confirm_uid_deleted(deleted_uids)
                logging.info('Confirmed deletion of %d uids'
                             % len(deleted_uids))
    else:
        # Fixed message: was "Failed to necessary data".
        logging.error(
            'Failed to get necessary data, st: %s, uid length: %d'
            % (st, len(uids)))
    # Schedule next task.
    logging.info('Schedule next fetching')
    scheduler.enter(
        SCHEDULE_INTERVAL, 1, kill_zombies, (scheduler,))
def kill_zombies(scheduler):
    """Remove a batch of zombie followers via weibo.cn and record deletions.

    Issues concurrent remove requests (CONCURRENT_CONN at a time) for up to
    ZOMBIE_KILL_LIMIT zombie UIDs, confirms deletions back to the DB, then
    re-schedules itself on `scheduler`.

    :param scheduler: a `sched.scheduler` used to re-enter this task.
    """
    st = get_st()
    uids = Follower.get_zombie_uids(limit=ZOMBIE_KILL_LIMIT)
    if uids and st:
        logging.info('Try to delete %d zombie followers' % len(uids))
        for i in range(0, len(uids), CONCURRENT_CONN):
            sub_uid_list = uids[i:i + CONCURRENT_CONN]
            concurrent_reqs = [
                grequests.get(remove_url(uid, st),
                              headers=HEADERS,
                              allow_redirects=False)
                for uid in sub_uid_list
            ]
            resp_list = grequests.map(concurrent_reqs)
            deleted_uids = []
            for uid, resp in zip(sub_uid_list, resp_list):
                if not resp:
                    logging.warn('None response, ignore')
                    # BUG FIX: previously fell through and raised
                    # AttributeError on resp.status_code for None responses.
                    continue
                # A 302 redirect is how weibo.cn signals a successful removal.
                if resp.status_code == 302:
                    deleted_uids.append(uid)
                    logging.info('Deleted %s' % uid)
                else:
                    logging.warn('Failed to delete %s because of %s'
                                 % (uid, resp.status_code))
            # Record back to DB about deleted UIDs.
            if deleted_uids:
                Follower.confirm_uid_deleted(deleted_uids)
                logging.info('Confirmed deletion of %d uids'
                             % len(deleted_uids))
    else:
        # Fixed message: was "Failed to necessary data".
        logging.error('Failed to get necessary data, st: %s, uid length: %d'
                      % (st, len(uids)))
    # Schedule next task.
    logging.info('Schedule next fetching')
    scheduler.enter(SCHEDULE_INTERVAL, 1, kill_zombies, (scheduler, ))
def db_test_data():
    """Create sample data for test database."""
    # Sample users.
    lemongrab = User(uname='lemongrab')
    bubblegum = User(uname='bubblegum')
    marceline = User(uname='marceline')
    simon = User(uname='simon')
    # Sample posts (authors attached via the `user` relationship).
    dungeon_post = Post(title="One million years dungeon",
                        content="One million years dungeon",
                        user=lemongrab, created=datetime.utcnow())
    reply_post = Post(title="Poor Lemongrab", content="You try your best",
                      user=bubblegum, references=[dungeon_post],
                      created=datetime.utcnow())
    kingdom_post = Post(title="Candy Kingdom",
                        content="It's my favorite kingdom!",
                        user=bubblegum, created=datetime.utcnow())
    db.session.add_all([lemongrab, bubblegum, marceline, simon,
                        dungeon_post, reply_post, kingdom_post])
    db.session.commit()
    # Bookmarks and followers reference ids generated by the first commit.
    bookmark = Bookmark(user=marceline, post_id=dungeon_post.id)
    follows = [
        Follower(user_id=lemongrab.id, follower_id=bubblegum.id),
        Follower(user_id=bubblegum.id, follower_id=marceline.id),
        Follower(user_id=bubblegum.id, follower_id=simon.id),
    ]
    db.session.add_all([bookmark] + follows)
    db.session.commit()
def follower():
    """Add followers to a user, display user followers."""
    # NOTE(review): the GET branch is an empty stub; a GET request currently
    # produces no explicit response value — confirm whether display logic is
    # still to be implemented.
    if request.method == "GET":
        pass
    if request.method == "POST":
        current_user = request.form.get(
            'follower')  # (user in current session)
        followed = request.form.get(
            'follow')  #user_id (who you want to follow)
        followed_user = request.form.get('followed-user')
        # NOTE(review): sibling code constructs Follower with `follower_id=`;
        # the `follower=` kwarg here looks inconsistent — verify against the
        # Follower model definition.
        new_follower = Follower(user_id=followed, follower=current_user)
        db.session.add(new_follower)
        db.session.commit()
        # Redirect back to the followed user's profile page.
        return redirect('/profile/' + str(followed_user))
def example_data():
    """Create example data for the test database."""
    # Clear existing rows, child tables before their parents so foreign-key
    # constraints are not violated.
    Dislike.query.delete()
    Stargazer.query.delete()
    Watcher.query.delete()
    Follower.query.delete()
    Contributor.query.delete()
    RepoLanguage.query.delete()
    Language.query.delete()
    Repo.query.delete()
    Account.query.delete()
    User.query.delete()

    # Users: jane was crawled recently, alex six weeks ago (stale), kelly
    # has never been crawled.
    jane = User(user_id="1", login="******", name="Jane",
                last_crawled=datetime.datetime.now(), last_crawled_depth=2)
    alex = User(user_id="2", login="******", name="Alex",
                last_crawled=(datetime.datetime.now() -
                              datetime.timedelta(weeks=6)),
                last_crawled_depth=2)
    kelly = User(user_id="3", login="******", name="Kelly")
    db.session.add_all([jane, alex, kelly])
    db.session.commit()

    # Only jane has an OAuth account/token.
    jane_account = Account(user_id="1", access_token="abc123")
    db.session.add(jane_account)
    db.session.commit()

    # Repos owned by jane; js_repo's crawl data is stale.
    py_repo = Repo(repo_id="1", name="python-repo",
                   description="A Python repository", owner_id="1",
                   last_crawled=datetime.datetime.now(),
                   last_crawled_depth=2,
                   url="https://github.com/jhacks/python-repo",
                   stargazers_count=2)
    js_repo = Repo(repo_id="2", name="js-repo",
                   description="A Javascript repository", owner_id="1",
                   last_crawled=(datetime.datetime.now() -
                                 datetime.timedelta(weeks=6)),
                   last_crawled_depth=1,
                   url="https://github.com/jhacks/js-repo",
                   stargazers_count=1)
    db.session.add_all([py_repo, js_repo])
    db.session.commit()

    # Stars and dislikes.
    astar = Stargazer(repo_id="1", user_id="2")
    kstar = Stargazer(repo_id="1", user_id="3")
    kstar_js = Stargazer(repo_id="2", user_id="3")
    a_dislike_js = Dislike(repo_id="2", user_id="2")
    # k_dislike_js = Dislike(repo_id="2", user_id="3")
    db.session.add_all([astar, kstar, kstar_js, a_dislike_js])
    db.session.commit()

    # Watchers and follower relationships (user_id is the one being
    # followed, follower_id the one following).
    kwatch = Watcher(repo_id="1", user_id="3")
    a_j_follow = Follower(user_id="1", follower_id="2")
    k_j_follow = Follower(user_id="1", follower_id="3")
    j_a_follow = Follower(user_id="2", follower_id="1")
    db.session.add_all([kwatch, a_j_follow, k_j_follow, j_a_follow])
    db.session.commit()

    # Contributors to the python repo.
    jcon = Contributor(repo_id="1", user_id="1")
    kcon = Contributor(repo_id="1",
                       user_id="3")
    db.session.add_all([jcon, kcon])
    db.session.commit()

    # python = Topic(topic_id="1", topic_name="python")
    # api = Topic(topic_id="2", topic_name="api")
    # db.session.add_all([python, api])
    # db.session.commit()
    # py_rep1 = RepoTopic(topic_id="1", repo_id="1")
    # api_rep1 = RepoTopic(topic_id="2", repo_id="1")
    # db.session.add_all([py_rep1, api_rep1])
    # db.session.commit()

    # Languages and per-repo language byte counts.
    py_lang = Language(language_id="1", language_name="python")
    c_lang = Language(language_id="2", language_name="c")
    db.session.add_all([py_lang, c_lang])
    db.session.commit()

    py_lang_rep1 = RepoLanguage(language_id="1", repo_id="1",
                                language_bytes=5000)
    c_lang_rep1 = RepoLanguage(language_id="2", repo_id="1",
                               language_bytes=100)
    db.session.add_all([py_lang_rep1, c_lang_rep1])
    db.session.commit()
# coding=utf-8 import logging import pickle import sched import grequests import time from model import Follower if not Follower.table_exists(): Follower.create_table() FORMAT = '%(asctime)-15s %(message)s' logging.basicConfig(filename='log/killer.log', level=logging.INFO, format=FORMAT) # Suppress other logging. for k in logging.Logger.manager.loggerDict: logging.getLogger(k).setLevel(logging.WARNING) # Load cookies. cookies = pickle.load(open('data/cookies.pkl', 'rb')) cookies_str = ';'.join('%s=%s' % (name, val) for name, val in cookies.items()) remove_url = 'http://weibo.cn/attention/remove?act=removec&uid={}&st={}'.format HEADERS = {'Cookie': cookies_str} SCHEDULE_INTERVAL = 60 # 1 min. ZOMBIE_KILL_LIMIT = 5000 CONCURRENT_CONN = 15
# coding=utf-8 import logging import pickle import re import sched from collections import namedtuple import grequests import time from model import Follower if not Follower.table_exists(): Follower.create_table() FORMAT = '%(asctime)-15s %(message)s' logging.basicConfig( filename='log/info_fetcher.log', level=logging.INFO, format=FORMAT) # Suppress other logging. for k in logging.Logger.manager.loggerDict: logging.getLogger(k).setLevel(logging.WARNING) # Load cookies. cookies = pickle.load(open('data/cookies.pkl', 'rb')) cookies_str = ';'.join('%s=%s' % (name, val) for name, val in cookies.items()) info_url = 'http://weibo.cn/u/{}'.format HEADERS = { 'Cookie': cookies_str } WEIBO_PATTERN = u'微博\[(\d+)\]'