Exemple #1
0
 def get_fetchers_by_user(self):
     """
     Log in every entry of self.users and collect the resulting fetchers.

     A single Weibo() instance is used for all logins. Each non-None
     login result is appended to self.fetchers; a None result is only
     reported and nothing is appended for that user.

     NOTE(review): a user whose login fails stays in self.users but gets
     no entry in self.fetchers, so the positions of the two lists may no
     longer match afterwards -- confirm whether code that indexes both
     lists with the same position expects them to be aligned.

     :return: None
     """
     weibo = Weibo()
     for account in self.users:
         session = weibo.login(account)
         if session is None:
             emphasis_print('User: %s login failure!' % (account.acct,))
             continue
         emphasis_print('User: %s login success!' % (account.acct,))
         self.fetchers.append(session)
     print('Get all fetchers by users!\n')
Exemple #2
0
    def reset_account(self):
        """
        Reset the account at position self.main_fetcher and drop it.

        The user at that position is handed to Dao.Account.reset inside a
        one-element list, then the same position is removed from both
        self.users and self.fetchers. If the removed position was the last
        one, self.main_fetcher wraps back to 0.

        :return: None
        :raises Exception: when no fetcher is left after the removal
        """
        current = self.main_fetcher
        Dao.Account.reset([self.users[current]])
        emphasis_print('One account expires!!!')

        # Remove the same position from both lists.
        self.users.pop(current)
        self.fetchers.pop(current)

        remaining = len(self.fetchers)
        if current == remaining:
            # The removed entry was the last one; start over from the front.
            self.main_fetcher = 0

        if remaining == 0:
            raise Exception('No valid account!')
Exemple #3
0
    def ban_account(self):
        """
        Ban and drop the account at position self.main_fetcher if the
        unfreeze page reports a problem.

        The unfreeze page is fetched with the current fetcher and passed to
        self.parser.is_exceptional and self.parser.is_frozen. When both
        return exactly False nothing else happens. Otherwise the user's
        acct is passed to Dao.Account.ban, the position is removed from
        self.users and self.fetchers, and self.main_fetcher wraps to 0 if
        the removed position was the last one.

        :return: None
        :raises Exception: when no fetcher is left after the removal
        """
        page = open_url(self.fetchers[self.main_fetcher], 'http://sass.weibo.com/unfreeze')
        exceptional = self.parser.is_exceptional(page)
        frozen = self.parser.is_frozen(page)
        # Identity comparison: only a literal False from both checks counts
        # as "nothing wrong"; any other value falls through to the ban.
        if exceptional is False and frozen is False:
            return

        current = self.main_fetcher
        Dao.Account.ban(self.users[current].acct)
        emphasis_print('One account has been banned!!!')

        # Remove the same position from both lists.
        self.users.pop(current)
        self.fetchers.pop(current)

        remaining = len(self.fetchers)
        if current == remaining:
            # The removed entry was the last one; start over from the front.
            self.main_fetcher = 0

        if remaining == 0:
            raise Exception('No valid account!')
Exemple #4
0
    def get_timelines(self, uid):
        """
        Collect the timelines of the user with this uid into
        self.timeline_list.

        self.get_timeline_page_num supplies the page count together with
        the content of the first page. Every item of the first page is
        parsed with self.parser.parse_timelines. Pages 2..N are then
        fetched in three parts each (bar numbers 0-2) through
        self.fetch_timelines_by_page_bar, with a random sleep after every
        request and after every page.

        After each page the time elapsed since self.start_time is compared
        with Config.ACCOUNT_CHANGE_TIME; once it is exceeded,
        self.main_fetcher is advanced with loop_increase and
        self.start_time is reset.

        :param uid: id of the user whose timelines are collected
        :return: None; results are appended to self.timeline_list
        """
        timeline_page_num, first_page = self.get_timeline_page_num(uid)
        if timeline_page_num == 0:
            print('No any posts.')
            return

        for pt in first_page:
            self.timeline_list.extend(self.parser.parse_timelines(pt, uid, datetime.now()))
        if timeline_page_num == 1:
            print('He/She just has one page timeline.')
            return

        for pnum in xrange(2, timeline_page_num+1):
            print('There are totally %d timeline pages.' % (timeline_page_num,))
            for bnum in xrange(3):
                html = self.fetch_timelines_by_page_bar(uid, pnum, bnum)
                time.sleep(random.randint(Config.SLEEP_BETWEEN_2FPAGES, 2*Config.SLEEP_BETWEEN_2FPAGES))
                # A None result means this part could not be fetched; skip it.
                if html is not None:
                    self.timeline_list.extend(self.parser.parse_timelines(html, uid, datetime.now()))
            self.end_time = datetime.now()
            duration = self.end_time - self.start_time
            # total_seconds() covers the whole interval; the .seconds attribute
            # is only the 0-86399 component and silently drops whole days.
            if duration.total_seconds() > Config.ACCOUNT_CHANGE_TIME:
                self.main_fetcher = loop_increase(self.main_fetcher, len(self.fetchers))
                self.start_time = datetime.now()
                emphasis_print('Account changed!!!')
                emphasis_print('Now %d of %d accounts are working!' % (self.main_fetcher+1, len(self.fetchers)))
            time.sleep(random.randint(Config.SLEEP_BETWEEN_TIMELINE_PAGES, 2*Config.SLEEP_BETWEEN_TIMELINE_PAGES))
        if uid not in crawled_list:
            uncrawled_list.append(uid)
    Task.reset(uncrawled_list)



if __name__ == '__main__':

    print 'Initializing...'
    crawled_list = []
    spider, uid_list, user_list = initialization()

    try:
        while True:
            for uid in uid_list:
                emphasis_print('Now %d of %d accounts are working!' % (spider.main_fetcher+1, len(spider.fetchers)))
                retcode = spider.collect_user_information(uid)
                if retcode == 404 or retcode == -1:
                    continue

                while True: # in case of connection lost
                    try:
                        spider.save()
                        break
                    except Exception as e:
                        print e.message, uid
                        if 'Lost connection to MySQL server during query' in e.message:
                            continue
                        else:
                            break