def get_fetchers_by_user(self):
    """
    Log in every account in self.users and collect the resulting fetchers.

    A login that yields a fetcher appends it to self.fetchers; a login
    that yields None is only reported.
    :return: None
    """
    weibo = Weibo()
    for account in self.users:
        session = weibo.login(account)
        if session is None:
            emphasis_print('User: %s login failure!' % (account.acct,))
            continue
        emphasis_print('User: %s login success!' % (account.acct,))
        self.fetchers.append(session)
    # NOTE(review): an account whose login failed stays in self.users with
    # no matching entry in self.fetchers -- confirm that code indexing both
    # lists with the same position tolerates this.
    print('Get all fetchers by users!\n')
def get_fetchers_by_user(self):
    """
    Log in every account in self.users and collect the resulting fetchers.

    A login that yields a fetcher appends it to self.fetchers; a login
    that yields None is only reported.
    :return: None
    """
    weibo = Weibo()
    for account in self.users:
        session = weibo.login(account)
        if session is None:
            emphasis_print('User: %s login failure!' % (account.acct,))
            continue
        emphasis_print('User: %s login success!' % (account.acct,))
        self.fetchers.append(session)
    # NOTE(review): an account whose login failed stays in self.users with
    # no matching entry in self.fetchers -- confirm that code indexing both
    # lists with the same position tolerates this.
    print('Get all fetchers by users!\n')
def reset_account(self):
    """
    Retire the account at self.main_fetcher after it has expired.

    The account is handed to Dao.Account.reset (as a one-element list),
    then its user and fetcher entries are removed. If the index now points
    one past the end of self.fetchers it wraps back to 0.
    :return: None
    :raises Exception: when no fetcher is left after the removal
    """
    current = self.main_fetcher
    Dao.Account.reset([self.users[current]])
    emphasis_print('One account expires!!!')

    self.users.pop(current)
    self.fetchers.pop(current)

    # The removed entry was the last one: restart from the first account.
    if current == len(self.fetchers):
        self.main_fetcher = 0
    if not self.fetchers:
        raise Exception('No valid account!')
def get_timelines(self, uid):
    """
    Collect all timeline posts of the user with this uid.

    Parsed posts are appended to self.timeline_list. The first page is
    taken from self.get_timeline_page_num(uid); every further page is
    fetched in three "bar" requests through
    self.fetch_timelines_by_page_bar, with a random sleep after each
    request and after each page.

    While paging, the active account (self.main_fetcher) is advanced with
    loop_increase once more than Config.ACCOUNT_CHANGE_TIME seconds have
    passed since self.start_time.
    :param uid: id of the user whose timelines are collected
    :return: None
    """
    timeline_page_num, first_page = self.get_timeline_page_num(uid)
    if timeline_page_num == 0:
        print('No any posts.')
        return

    for pt in first_page:
        self.timeline_list.extend(
            self.parser.parse_timelines(pt, uid, datetime.now()))
    if timeline_page_num == 1:
        print('He/She just has one page timeline.')
        return

    for pnum in xrange(2, timeline_page_num + 1):
        print('There are totally %d timeline pages.' % (timeline_page_num,))
        for bnum in xrange(3):
            html = self.fetch_timelines_by_page_bar(uid, pnum, bnum)
            time.sleep(random.randint(Config.SLEEP_BETWEEN_2FPAGES,
                                      2 * Config.SLEEP_BETWEEN_2FPAGES))
            # A bar that could not be fetched is skipped, not retried here.
            if html is not None:
                self.timeline_list.extend(
                    self.parser.parse_timelines(html, uid, datetime.now()))

        # NOTE(review): the rotation check and the sleep below run once per
        # page (after its three bars) -- confirm this is the intended
        # granularity.
        self.end_time = datetime.now()
        duration = self.end_time - self.start_time
        # total_seconds() covers the whole interval; timedelta.seconds is
        # only the seconds component and silently drops whole days.
        if duration.total_seconds() > Config.ACCOUNT_CHANGE_TIME:
            self.main_fetcher = loop_increase(self.main_fetcher,
                                              len(self.fetchers))
            self.start_time = datetime.now()
            emphasis_print('Account changed!!!')
            emphasis_print('Now %d of %d accounts are working!'
                           % (self.main_fetcher + 1, len(self.fetchers)))
        time.sleep(random.randint(Config.SLEEP_BETWEEN_TIMELINE_PAGES,
                                  2 * Config.SLEEP_BETWEEN_TIMELINE_PAGES))
def ban_account(self):
    """
    Probe the active account and retire it when it is restricted.

    The unfreeze page is opened with the fetcher at self.main_fetcher and
    passed to the parser's is_exceptional and is_frozen checks. Only when
    both results are exactly False is the account kept. Otherwise it is
    recorded through Dao.Account.ban, its user and fetcher entries are
    removed, and the index wraps to 0 if it now points one past the end.
    :return: None
    :raises Exception: when no fetcher is left after the removal
    """
    probe_url = 'http://sass.weibo.com/unfreeze'
    page = open_url(self.fetchers[self.main_fetcher], probe_url)
    exceptional = self.parser.is_exceptional(page)
    frozen = self.parser.is_frozen(page)

    # Identity tests are deliberate: any result other than the literal
    # False counts as "restricted".
    if exceptional is not False or frozen is not False:
        current = self.main_fetcher
        Dao.Account.ban(self.users[current].acct)
        emphasis_print('One account has been banned!!!')

        self.users.pop(current)
        self.fetchers.pop(current)

        # The removed entry was the last one: restart from the first account.
        if current == len(self.fetchers):
            self.main_fetcher = 0
        if not self.fetchers:
            raise Exception('No valid account!')
def get_timelines(self, uid):
    """
    Collect all timeline posts of the user with this uid.

    Parsed posts are appended to self.timeline_list. The first page is
    taken from self.get_timeline_page_num(uid); every further page is
    fetched in three "bar" requests through
    self.fetch_timelines_by_page_bar, with a random sleep after each
    request and after each page.

    While paging, the active account (self.main_fetcher) is advanced with
    loop_increase once more than Config.ACCOUNT_CHANGE_TIME seconds have
    passed since self.start_time.
    :param uid: id of the user whose timelines are collected
    :return: None
    """
    timeline_page_num, first_page = self.get_timeline_page_num(uid)
    if timeline_page_num == 0:
        print('No any posts.')
        return

    for pt in first_page:
        self.timeline_list.extend(
            self.parser.parse_timelines(pt, uid, datetime.now()))
    if timeline_page_num == 1:
        print('He/She just has one page timeline.')
        return

    for pnum in xrange(2, timeline_page_num + 1):
        print('There are totally %d timeline pages.' % (timeline_page_num,))
        for bnum in xrange(3):
            html = self.fetch_timelines_by_page_bar(uid, pnum, bnum)
            time.sleep(random.randint(Config.SLEEP_BETWEEN_2FPAGES,
                                      2 * Config.SLEEP_BETWEEN_2FPAGES))
            # A bar that could not be fetched is skipped, not retried here.
            if html is not None:
                self.timeline_list.extend(
                    self.parser.parse_timelines(html, uid, datetime.now()))

        # NOTE(review): the rotation check and the sleep below run once per
        # page (after its three bars) -- confirm this is the intended
        # granularity.
        self.end_time = datetime.now()
        duration = self.end_time - self.start_time
        # total_seconds() covers the whole interval; timedelta.seconds is
        # only the seconds component and silently drops whole days.
        if duration.total_seconds() > Config.ACCOUNT_CHANGE_TIME:
            self.main_fetcher = loop_increase(self.main_fetcher,
                                              len(self.fetchers))
            self.start_time = datetime.now()
            emphasis_print('Account changed!!!')
            emphasis_print('Now %d of %d accounts are working!'
                           % (self.main_fetcher + 1, len(self.fetchers)))
        time.sleep(random.randint(Config.SLEEP_BETWEEN_TIMELINE_PAGES,
                                  2 * Config.SLEEP_BETWEEN_TIMELINE_PAGES))
for uid in uid_list: if uid not in crawled_list: uncrawled_list.append(uid) Task.reset(uncrawled_list) if __name__ == '__main__': print 'Initializing...' crawled_list = [] spider, uid_list, user_list = initialization() try: while True: for uid in uid_list: emphasis_print('Now %d of %d accounts are working!' % (spider.main_fetcher + 1, len(spider.fetchers))) retcode = spider.collect_user_information(uid) if retcode == 404 or retcode == -1: continue while True: # in case of connection lost try: spider.save() break except Exception as e: print e.message, uid if 'Lost connection to MySQL server during query' in e.message: continue else: break
if uid not in crawled_list: uncrawled_list.append(uid) Task.reset(uncrawled_list) if __name__ == '__main__': print 'Initializing...' crawled_list = [] spider, uid_list, user_list = initialization() try: while True: for uid in uid_list: emphasis_print('Now %d of %d accounts are working!' % (spider.main_fetcher+1, len(spider.fetchers))) retcode = spider.collect_user_information(uid) if retcode == 404 or retcode == -1: continue while True: # in case of connection lost try: spider.save() break except Exception as e: print e.message, uid if 'Lost connection to MySQL server during query' in e.message: continue else: break