def get_followees(self, pid):
    url = 'http://www.weibo.com/p/' + pid + '/follow?from=page_' + pid[:6] + '&wvr=6&mod=headfollow#place'
    while True:
        fetcher = self.fetchers[self.main_fetcher]
        html = open_url(fetcher, url)
        uid = self.parser.parse_uid(html)
        if uid == -1:
            self.ban_account()
            continue
        elif self.parser.is_visitor(html) is True:
            self.reset_account()
            continue
        fee_page_num = self.get_followee_page_num(html)
        if fee_page_num is not None:
            break
        else:
            log.warning('Cannot get followee page total number - pid:%s' % (pid,))
            time.sleep(random.randint(Config.SLEEP_WHEN_EXCEPTION, 2 * Config.SLEEP_WHEN_EXCEPTION))
    if fee_page_num == 0:
        print 'He/She does not follow anyone.'
        return
    else:
        print 'Getting followee page 1 of %d...' % (fee_page_num,)
        followees = self.parser.parse_followees(html, pid, datetime.now())
        # followees cannot be None here: the html has already been validated by
        # self.get_followee_page_num(html) -> self.parser.parse_followee_page_num(html)
        self.followee_list.extend(followees)
    if fee_page_num == 1:
        return
    for i in xrange(2, fee_page_num + 1):
        while True:
            url = 'http://www.weibo.com/p/%s/follow?from=page_%s&wvr=6&mod=headfollow&page=%d#place' % (pid, pid[:6], i)
            print 'Getting followee page %d of %d...' % (i, fee_page_num)
            html = open_url(fetcher, url)
            time.sleep(random.randint(Config.SLEEP_BETWEEN_2FPAGES, 2 * Config.SLEEP_BETWEEN_2FPAGES))
            followees = self.parser.parse_followees(html, pid, datetime.now())
            if followees is None:  # dirty html
                log.warning('Cannot parse followee page correctly - pid:%s' % (pid,))
                time.sleep(random.randint(Config.SLEEP_WHEN_EXCEPTION, 2 * Config.SLEEP_WHEN_EXCEPTION))
                continue
            self.followee_list.extend(followees)
            break
def get_pid(self, uid):
    """
    :param uid:
    :return: corresponding pid, or -1 if the user is an enterprise account
    """
    fetcher = self.fetchers[self.main_fetcher]
    url = 'http://www.weibo.com/u/%s' % (uid,)
    while True:
        html = open_url(fetcher, url)
        parsed_uid = self.parser.parse_uid(html)
        if parsed_uid == -1:
            self.ban_account()
            continue
        elif self.parser.is_visitor(html) is True:
            self.reset_account()
            continue
        # make sure that the html is correct
        is_enterprise = self.parser.parse_is_enterprise(html)
        if is_enterprise is True:
            return -1  # -1 denotes this user is an enterprise account
        pid = self.parser.parse_pid(html)
        if pid is not None:
            return pid
        else:
            log.error('Cannot get pid for uid:%s' % (uid,))
            time.sleep(random.randint(Config.SLEEP_WHEN_EXCEPTION, 2 * Config.SLEEP_WHEN_EXCEPTION))
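# Illustration only (not from the original repo): the pid handled throughout this spider is
# the profile page id, i.e. a 6-character page/domain prefix followed by the plain uid.
# That layout is why get_profile() below recovers the uid with pid[6:] and the follow-page
# URLs are built with pid[:6]. The sample values here are made up for demonstration.
sample_pid = '100505' + '2971804112'    # assumed personal-page prefix + uid
assert sample_pid[:6] == '100505'       # page/domain type used in the follow-page URLs
assert sample_pid[6:] == '2971804112'   # plain uid, as used by get_profile()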
def get_timeline_page_num(self, uid):
    """
    :param uid:
    :return: page number, plus the already-fetched parts of the first page, which decreases accesses to the Sina server
    """
    htmls = []  # keep the fetched pages to decrease accesses to Sina
    while True:
        first_page_head = self.fetch_timelines_by_page_bar(uid, 1, 0)
        if first_page_head is None:  # no posts at all
            return 0, htmls
        else:
            htmls.append(first_page_head)
        time.sleep(random.randint(Config.SLEEP_BETWEEN_2FPAGES, 2 * Config.SLEEP_BETWEEN_2FPAGES))
        first_page_body = self.fetch_timelines_by_page_bar(uid, 1, 1)
        if first_page_body is None:
            return 1, htmls
        else:
            htmls.append(first_page_body)
        time.sleep(random.randint(Config.SLEEP_BETWEEN_2FPAGES, 2 * Config.SLEEP_BETWEEN_2FPAGES))
        first_page_tail = self.fetch_timelines_by_page_bar(uid, 1, 2)
        if first_page_tail is None:  # just one page of timelines
            return 1, htmls
        else:
            htmls.append(first_page_tail)
        time.sleep(random.randint(Config.SLEEP_BETWEEN_2FPAGES, 2 * Config.SLEEP_BETWEEN_2FPAGES))
        # this page number is not accurate, so it is corrected in the next step
        pnum = self.parser.parse_timeline_page_num(first_page_tail)
        if pnum is None or pnum == 1:
            return 1, htmls
        while True:
            url = 'http://www.weibo.com/%s?page=%d&pids=Pl_Content_HomeFeed' % (uid, pnum)
            test_html = open_url(self.fetchers[self.main_fetcher], url)
            time.sleep(random.randint(Config.SLEEP_BETWEEN_2FPAGES, 2 * Config.SLEEP_BETWEEN_2FPAGES))
            no_post = 'W_icon icon_warnB'
            if no_post in test_html:
                pnum -= 1  # the reported last page is empty, so step the page number back
            else:
                break
        return pnum, htmls
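# A minimal sketch (not part of the original class) of how get_timeline_page_num() and
# fetch_timelines_by_page_bar() fit together: the first call returns the corrected page
# count plus the parts of page 1 that were already downloaded, and every remaining page is
# then pulled in its three lazily-loaded parts (bnum 0, 1, 2). The method name
# collect_all_timeline_pages is an assumption made for this example.
def collect_all_timeline_pages(self, uid):
    pnum, htmls = self.get_timeline_page_num(uid)
    for page in xrange(2, pnum + 1):
        for bar in (0, 1, 2):
            part = self.fetch_timelines_by_page_bar(uid, page, bar)
            if part is None:  # this part of the page holds no posts
                break
            htmls.append(part)
            time.sleep(random.randint(Config.SLEEP_BETWEEN_2FPAGES, 2 * Config.SLEEP_BETWEEN_2FPAGES))
    return htmls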
def get_profile(self, pid):
    '''
    get profile information for the user marked with pid
    :param pid: page id
    :return:
    '''
    url = 'http://www.weibo.com/p/%s/info?mod=pedit_more' % (pid,)
    uid = pid[6:]
    is_taobao = None
    while is_taobao is None:
        try:
            is_taobao = self.is_taobao(uid)  # get taobao information in advance
            if is_taobao == -1:
                self.ban_account()
                is_taobao = None
            elif is_taobao == -2:
                self.reset_account()
                is_taobao = None
        except Exception as e:
            print e.message
        time.sleep(random.randint(Config.SLEEP_BETWEEN_2FPAGES, 2 * Config.SLEEP_BETWEEN_2FPAGES))
    profile = None
    print 'Getting profile page...'
    while profile is None:
        fetcher = self.fetchers[self.main_fetcher]
        html = open_url(fetcher, url)
        if self.parser.parse_uid(html) == -1:  # -1 means the working account may have been banned
            self.ban_account()
            continue
        elif self.parser.is_visitor(html) is True:  # check whether the working account fell into visitor status
            self.reset_account()
            continue
        profile = self.parser.parse_profile(html, pid, is_taobao, datetime.now())
        time.sleep(random.randint(Config.SLEEP_BETWEEN_2FPAGES, 2 * Config.SLEEP_BETWEEN_2FPAGES))
    self.profile_list.append(profile)
def is_taobao(self, uid):
    '''
    :param uid: user ID
    :return: the taobao flag ('1' or '0') indicating whether the user is a taobao shopkeeper,
             -1 if the working account is banned, or -2 if it is in visitor status
    '''
    fetcher = self.fetchers[self.main_fetcher]
    url = 'http://www.weibo.com/u/' + uid
    html = open_url(fetcher, url)
    with open('debug_taobao.txt', 'w') as writer:  # keep the raw page for debugging
        writer.write(html)
    if self.parser.parse_uid(html) == -1:
        return -1  # account is banned
    elif self.parser.is_visitor(html) is True:
        return -2  # account is in visitor status
    return self.parser.parse_is_taobao(html)
def ban_account(self):
    url = 'http://sass.weibo.com/unfreeze'
    html = open_url(self.fetchers[self.main_fetcher], url)
    is_exceptional = self.parser.is_exceptional(html)
    is_frozen = self.parser.is_frozen(html)
    if is_exceptional is False and is_frozen is False:
        return
    account = self.users[self.main_fetcher].acct
    Dao.Account.ban(account)
    emphasis_print('One account has been banned!!!')
    self.users.pop(self.main_fetcher)
    self.fetchers.pop(self.main_fetcher)
    if self.main_fetcher == len(self.fetchers):
        self.main_fetcher = 0
    if len(self.fetchers) == 0:
        raise Exception('No valid account!')
def get_server_data(self, opener):
    print "Getting server time and nonce..."
    server_data = open_url(opener, self.server_url)  # get server_data
    return self.parse_server_data(server_data)
def fetch_timelines_by_page_bar(self, uid, pnum, bnum):
    """
    fetch timelines by specifying page number and bar number
    :param uid:
    :param pnum: page number
    :param bnum: bar number
    :return: html containing timelines or None if there are no timelines
    """
    body = {
        # These parameters were obtained by packet capture. Sina Weibo loads a timeline page
        # lazily (waterfall style), so a whole page cannot be fetched with a single request.
        '__rnd': 1343647638078,
        '_k': 1343647471134109,
        '_t': 0,
        'count': 15,
        'end_id': 3473519214542343,
        'max_id': 3473279479126179,
        'page': 1,
        'pagebar': 1,
        'pre_page': 1,
        'uid': uid
    }
    body['page'] = pnum
    if bnum == 0:
        body['count'] = '50'
        body['pagebar'] = ''
        body['pre_page'] = pnum - 1
    elif bnum == 1:
        body['count'] = '15'
        body['pagebar'] = '0'
        body['pre_page'] = pnum
    elif bnum == 2:
        body['count'] = '15'
        body['pagebar'] = '1'
        body['pre_page'] = pnum
    url = 'http://weibo.com/aj/mblog/mbloglist?' + urllib.urlencode(body)
    while True:
        try:
            print 'Getting timeline page %d part %d...' % (pnum, bnum + 1)  # bnum ranges from 0 to 2
            jsn_data = open_url(self.fetchers[self.main_fetcher], url)
            if self.parser.is_frozen(jsn_data):
                self.ban_account()
                continue
            data = json.loads(jsn_data)
            html = data['data']
            if u'WB_feed_type SW_fun S_line2' in html:
                return html
            else:
                return None
        except Exception as e:
            if 'No valid account!' in e.message:
                raise e
            if 'No JSON object could be decoded' in e.message:
                if self.parser.is_visitor(jsn_data) is True:
                    self.reset_account()
                else:
                    self.ban_account()
            log.warning(e.message)
            time.sleep(random.randint(Config.SLEEP_WHEN_EXCEPTION, 2 * Config.SLEEP_WHEN_EXCEPTION))
            continue
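# Quick illustration (assumed, not taken from the original repo) of the request built above
# for page 2, part 1 of a timeline: between the three parts only page, count, pagebar and
# pre_page change, while the remaining fields stay at the captured constants.
example = {'page': 2, 'count': '15', 'pagebar': '0', 'pre_page': 2, 'uid': '2971804112'}
print 'http://weibo.com/aj/mblog/mbloglist?' + urllib.urlencode(example)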
# -*- coding: UTF-8 -*-
__author__ = 'chzhu'

import Spider
from Spider import Spider as SP
from Dao import Task, Account

if __name__ == '__main__':
    # user_1 = Spider.User('*****@*****.**', 'a123456')
    # user_2 = Spider.User('*****@*****.**', 'w1127401044')
    # user_list = [user_1, user_2]
    #
    # sp = SP(user_list)
    #
    # uid_list = Task.get_all()
    #
    # for uid in uid_list:
    #     sp.collect_user_profiles_only(uid)
    #     sp.save_only_profile()

    user = Spider.User('*****@*****.**', 'pp9999')
    user_list = [user]
    sp = SP(user_list)

    from Utility import open_url
    uid = '2971804112'
    url = 'http://www.weibo.com/aj/v6/user/newcard?ajwvr=6&id=%s&type=0&refer_flag=0000011002_&callback=STK_145465715259821'
    r = open_url(sp.fetchers[sp.main_fetcher], url % uid)
    print 'dd'