Example #1
0
class Controller(object):
    """Crawl Weibo search results for one movie in one-hour time slices.

    The crawl window comes from the module-level ``START_TIME`` and
    ``END_TIME`` strings, both formatted as ``%Y-%m-%d-%H``.
    """

    def __init__(self, movie_name):
        """Open a session via ``login()`` and create the ``Movie`` record.

        :param movie_name: search keyword; it is URL-quoted into every query.
        """
        super(Controller, self).__init__()
        self.session = login()
        self.movie_name = movie_name
        self.movie = Movie(self.movie_name, self.session)

    def start(self):
        """Crawl every one-hour slice from START_TIME up to END_TIME.

        Prints an error and does nothing when START_TIME is not strictly
        earlier than END_TIME.
        """
        hour_format = '%Y-%m-%d-%H'
        start_time = datetime.datetime.strptime(START_TIME, hour_format)
        end_time = datetime.datetime.strptime(END_TIME, hour_format)
        if not start_time < end_time:
            print('ERROR:Start time must early than end time!')
            return

        one_hour = datetime.timedelta(hours=1)
        while True:
            # Pause before every search request.
            time.sleep(3)
            cursor_time = start_time + one_hour
            timescope = (start_time.strftime(hour_format) + ':'
                         + cursor_time.strftime(hour_format))
            url = ('http://s.weibo.com/weibo/' + quote(self.movie_name)
                   + '&timescope=custom:' + quote(timescope))
            response = self.session.get(url)
            count = self.get_page_count(response.content)
            self.handle_one_page(url, count, self.movie.id)
            start_time = cursor_time
            # The slice that starts exactly at END_TIME is still crawled;
            # the loop only stops once the cursor has moved past it.
            if start_time > end_time:
                break

    def get_page_count(self, content):
        """Return the highest ``&page=N`` number linked from *content*.

        :param content: raw response body (a byte string).
        :return: the largest page number found, or 0 when the body carries
            no pagination links at all.
        """
        page_numbers = [int(number)
                        for number in re.findall(br'&page=(\d+)', content)]
        max_count = max(page_numbers) if page_numbers else 0
        print('max_page: %s' % max_count)
        return max_count

    def handle_one_page(self, url_head, page_count, movie_id, delay=3):
        """Fetch result pages 1..page_count and hand each to the movie decoder.

        :param url_head: search URL without the ``&page=`` suffix.
        :param page_count: value reported by :meth:`get_page_count`.
        :param movie_id: passed through to ``self.movie.decode_content``.
        :param delay: seconds to sleep after each page (default 3).
        """
        # get_page_count reports 0 when there are no pagination links, which
        # presumably is also what a single-page result looks like.  Treat it
        # as one page so that slice is not silently skipped.
        if page_count == 0:
            page_count = 1
        for page in range(1, page_count + 1):
            url = '%s&page=%s' % (url_head, page)
            response = self.session.get(url)
            self.movie.decode_content(response.content, movie_id)
            print("现在是第%s页" % page)
            time.sleep(delay)
Example #2
0
class Controller(object):
    """Crawl Weibo search results for one movie over a range of days.

    The range comes from the module-level ``START_DAY`` / ``END_DAY`` strings
    (``%Y-%m-%d``).  The whole range is searched first; when that search
    reports 50 or more result pages the range is split per day, and a day
    that still reports 50 or more pages is split per hour.
    """

    def __init__(self, movie_name):
        """Open a session via ``login()`` and create the ``Movie`` record.

        :param movie_name: search keyword; it is URL-quoted into every query.
        """
        super(Controller, self).__init__()
        self.session = login()
        self.movie_name = movie_name
        self.movie = Movie(self.movie_name, self.session)
        # Earlier endpoint, kept for reference: 'http://s.weibo.com/weibo/'
        self.url = 'http://s.weibo.com/wb/'
        # Running total of result pages requested by handle_pages.
        self.page_count = 0

    def start(self):
        """Validate START_DAY/END_DAY and crawl the range between them."""
        start_time = datetime.datetime.strptime(START_DAY, '%Y-%m-%d')
        end_time = datetime.datetime.strptime(END_DAY, '%Y-%m-%d')
        if start_time > end_time:
            print('ERROR:起始时间必须比结束时间早!')
            return
        print('Name: %s' % self.movie_name)
        print('Time: %s -- %s' % (START_DAY, END_DAY))
        self.search_nomarl(start_time, end_time)

    def _search_url(self, timescope):
        """Return the search URL for *timescope* with the xsort/nodup flags."""
        return (self.url + quote(self.movie_name) + '&timescope=custom:'
                + quote(timescope) + '&xsort=time&nodup=1')

    def _fetch_page_count(self, url, robot_notice):
        """Request *url* until it is not the captcha page; return its page count.

        :param robot_notice: ``%``-template printed (with REBOT_SLEEP_TIME)
            each time the captcha page is detected.
        """
        while True:
            content = self.session.get(url).content
            count = self.get_page_count(content)
            # Only a body without pagination links is checked for the captcha
            # marker; is_rebot sends a mail, so it must not run needlessly.
            if count == 0 and self.is_rebot(content):
                print(robot_notice % REBOT_SLEEP_TIME)
                time.sleep(REBOT_SLEEP_TIME)
                continue
            return count

    def search_nomarl(self, start_time, end_time):
        """Search the whole range at once; fall back to per-day searches.

        :param start_time: first day of the range (``datetime``).
        :param end_time: last day of the range (``datetime``).
        :return: ``'End'`` after a per-day crawl, otherwise whatever
            :meth:`handle_pages` returns.
        """
        print('search_nomarl:::')
        start_day = start_time.strftime('%Y-%m-%d')
        end_day = end_time.strftime('%Y-%m-%d')
        url = self._search_url(start_day + ':' + end_day)
        print(url)
        count = self._fetch_page_count(url, '变机器人了,需要帮助, sleep %ss')

        # 50 looks like the most pages a single search exposes, so reaching it
        # means results may be cut off.  ">=" instead of "==" so an unexpected
        # larger count can no longer leave the request loop spinning.
        if count >= 50:
            print('数据等于50页,分片爬取')
            self.search_by_day(start_day, end_day)
            return 'End'
        if count == 0:
            print('数据0页~~!')
        else:
            print('数据小于50页,直接爬取')
        # NOTE(review): the bounds are passed as '%Y-%m-%d' strings here while
        # search_by_day/search_by_hour pass datetimes -- confirm what
        # Movie.decode_content expects.
        return self.handle_pages(url, count, self.movie.id, start_day, end_day)

    def search_by_day(self, start_time, end_time):
        """Crawl the range one day at a time, newest day first.

        Used when the whole-range search cannot expose every result.

        :param start_time: first day, as a ``%Y-%m-%d`` string.
        :param end_time: last day, as a ``%Y-%m-%d`` string.
        """
        print('search_by_day:::')
        first_day = datetime.datetime.strptime(start_time, '%Y-%m-%d')
        day = datetime.datetime.strptime(end_time, '%Y-%m-%d')
        one_day = datetime.timedelta(days=1)
        while True:
            day_text = day.strftime('%Y-%m-%d')
            url = self._search_url(day_text + ':' + day_text)
            print(url)
            count = self._fetch_page_count(url, '变机器人了,需要帮助, sleep %ss')

            if count >= 50:
                # Too many pages for one day: split it by hour.
                print('数据等于50页,分片爬取')
                self.search_by_hour(day)
            else:
                # A zero-page day used to re-request the same URL forever;
                # crawl it like the other search methods do and move on.
                if count == 0:
                    print('数据0页~~!')
                else:
                    print('数据小于50页,直接爬取')
                self.handle_pages(url, count, self.movie.id, day, day)

            day -= one_day
            if first_day > day:
                break
            print('while sleep %ss...' % SEARCH_SLEEP_TIME)
            time.sleep(SEARCH_SLEEP_TIME)

    def search_by_hour(self, day):
        """Crawl a single day hour by hour, from 23:00 down to 00:00.

        :param day: any ``datetime`` within the day to crawl.
        """
        print("search_by_hour:::")
        day_text = day.strftime('%Y-%m-%d')
        for hour in range(23, -1, -1):
            # Debug trace keeps the 24..1 numbering of earlier logs.
            print('i_day %s' % (hour + 1))
            hour_time = datetime.datetime.strptime(
                '%s-%d' % (day_text, hour), '%Y-%m-%d-%H')
            # Start and end of the scope are the same hour.
            stamp = hour_time.strftime('%Y-%m-%d-%H')
            url = self._search_url(stamp + ':' + stamp)
            print(url)
            # The notice now reports REBOT_SLEEP_TIME, the delay actually used.
            count = self._fetch_page_count(
                url, '变机器人了,需要帮助,while sleep %ss...')
            if count == 0:
                print('数据0页~~!')
            self.handle_pages(url, count, self.movie.id, hour_time, hour_time)
            print('while sleep %ss...' % SEARCH_SLEEP_TIME)
            time.sleep(SEARCH_SLEEP_TIME)

    def get_page_count(self, content):
        """Return the highest ``&page=N`` number linked from *content*.

        :param content: raw response body (a byte string).
        :return: the largest page number found, or 0 when there is none.
        """
        result = re.findall(br'&page=(\d+)', content)
        # The trailing 0 makes max() safe when no link was found.
        result.append(0)
        max_count = max(int(number) for number in result)
        print('All Page: %s Max Page:--> %s' % (result, max_count))
        return max_count

    def is_rebot(self, content):
        """Return True, and send a notification mail, for the captcha page.

        :param content: raw response body (a byte string).
        """
        marker = u'我真滴不是机器人'.encode('utf-8')
        if marker not in self.format_content(content):
            return False
        # Imported lazily, only when a mail actually has to be sent.
        from send_mail import send_mail
        send_mail('机器人', '快填验证码!')
        return True

    def _crawl_page(self, url, movie_id, start_time, end_time):
        """Fetch and decode one result page.

        Waits out captcha pages, and retries once when the decoder answers
        ``'page_error'``.

        :return: the last value returned by ``self.movie.decode_content``.
        """
        error_num = 0
        while True:
            content = self.session.get(url).content
            if self.is_rebot(content):
                print('变机器人了,需要帮助,while sleep %ss...' % REBOT_SLEEP_TIME)
                time.sleep(REBOT_SLEEP_TIME)
                continue
            result = self.movie.decode_content(
                content, movie_id, start_time, end_time)
            if result != 'page_error':
                return result
            time.sleep(1)
            error_num += 1
            if error_num > 1:
                # Second failure in a row: give up on this page.
                return result

    def handle_pages(self, url_head, page_count, movie_id, start_time, end_time):
        """Crawl result pages 1..page_count of one search.

        Stops early as soon as the decoder returns a falsy value.  A page that
        still answers ``'page_error'`` after one retry is skipped.

        :param url_head: search URL without the ``&page=`` suffix.
        :param page_count: value reported by :meth:`get_page_count`; 0 is
            treated as a single page.
        :param movie_id: passed through to ``self.movie.decode_content``.
        :param start_time: passed through to ``self.movie.decode_content``.
        :param end_time: passed through to ``self.movie.decode_content``.
        :return: the decoder's result for the last page that was requested.
        """
        earliest_time = False
        if page_count == 0:
            page_count = 1
        for i in range(1, page_count + 1):
            self.page_count += 1
            url = '%s&page=%s' % (url_head, i)
            print('Start Page %s ->:%02d / %02d . All Data Page: %s'
                  % ('-' * 90, i, page_count, self.page_count))
            print(url)
            earliest_time = self._crawl_page(
                url, movie_id, start_time, end_time)
            if not earliest_time:
                break
            if earliest_time != 'page_error':
                # Randomised pause, weighted towards 4-5 seconds.
                n = random.choice(
                    [1, 2, 3, 3, 4, 4, 4, 4, 4, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 10])
                print('End  Page %s ->:%02d / %02d while sleep %ss...\n'
                      % ('-' * 90, i, page_count, n))
                time.sleep(n)
            time.sleep(2)
        return earliest_time

    def format_content(self, content):
        r"""Expand backslash escapes in *content* and return UTF-8 bytes.

        ``\uXXXX`` style escapes are decoded and ``\/`` is turned into ``/``.
        """
        decoded = content.decode('unicode_escape').encode("utf-8")
        return decoded.replace(b"\\/", b"/")
Example #3
0
class Controller(object):
    """Crawl Weibo search results for one movie, newest posts first.

    The range comes from the module-level ``START_DAY`` / ``END_DAY`` strings
    (``%Y-%m-%d``).  An unrestricted search runs first; when the earliest
    post it reaches is still later than the start of the range, the remaining
    days are searched one by one, and a day reporting 50 or more result pages
    is searched hour by hour.
    """

    def __init__(self, movie_name):
        """Open a session via ``login()`` and create the ``Movie`` record.

        :param movie_name: search keyword; it is URL-quoted into every query.
        """
        super(Controller, self).__init__()
        self.session = login()
        self.movie_name = movie_name
        self.movie = Movie(self.movie_name, self.session)

    def start(self):
        """Validate START_DAY/END_DAY and run the crawl."""
        start_time = datetime.datetime.strptime(START_DAY + '-0', '%Y-%m-%d-%H')
        end_time = datetime.datetime.strptime(END_DAY + '-23', '%Y-%m-%d-%H')
        if not start_time < end_time:
            print('ERROR:起始时间必须比结束时间早!')
            return
        earliest_time = self.search_nomarl(start_time, end_time)
        print('普通搜索获取到的最早时间: %s' % (earliest_time,))
        if earliest_time and earliest_time > start_time:
            # The plain search did not reach back far enough.
            print('普通搜索没有搜索到足够数据,继续往前爬取!')
            self.search_by_day(start_time, earliest_time)

    def _search_url(self, timescope):
        """Return the search URL restricted to *timescope*."""
        return ('http://s.weibo.com/weibo/' + quote(self.movie_name)
                + '&timescope=custom:' + quote(timescope))

    def _fetch_page_count(self, url):
        """Request *url* until it is not the captcha page; return its page count."""
        while True:
            content = self.session.get(url).content
            count = self.get_page_count(content)
            # Only a body without pagination links is checked for the captcha
            # marker; is_rebot sends a mail, so it must not run needlessly.
            if count == 0 and self.is_rebot(content):
                print('变机器人了,需要帮助!!')
                time.sleep(REBOT_SLEEP_TIME)
                continue
            return count

    def search_nomarl(self, start_time, end_time):
        """Run the plain search, without any time restriction in the URL.

        :param start_time: passed through to :meth:`handle_pages`.
        :param end_time: passed through to :meth:`handle_pages`.
        :return: whatever :meth:`handle_pages` returns.
        """
        url = 'http://s.weibo.com/weibo/' + quote(self.movie_name)
        print(url)
        while True:
            print('url -------- true:')
            page_content = self.session.get(url).content
            # Debug dump of the latest listing page (overwritten every time).
            with open('tttttt', 'wb') as dump:
                dump.write(page_content)
            print('file tttttt save ok..~!')
            count = self.get_page_count(page_content)

            if count == 0 and self.is_rebot(page_content):
                print('变机器人了,需要帮助!!')
                time.sleep(REBOT_SLEEP_TIME)
                continue

            return self.handle_pages(
                url, count, self.movie.id, start_time, end_time)

    def search_by_day(self, start_time, end_time):
        """Crawl one day at a time, from *end_time*'s day back to *start_time*.

        Used when the plain search cannot reach every post in the range.

        :param start_time: ``datetime``; crawling stops once the current day
            is earlier than this.
        :param end_time: ``datetime``; its calendar day is crawled first.
        """
        # Truncate to midnight of that day.
        day = datetime.datetime.strptime(
            end_time.strftime('%Y-%m-%d'), '%Y-%m-%d')
        one_day = datetime.timedelta(days=1)
        while True:
            day_text = day.strftime('%Y-%m-%d')
            url = self._search_url(day_text + ':' + day_text)
            print(url)
            count = self._fetch_page_count(url)

            if count >= 50:
                # Too many pages for one day: crawl it by hour instead.  This
                # used to `continue`, re-requesting the same day (and seeing
                # 50 pages again) forever.  The day listing is not crawled in
                # addition, since the hourly crawl covers the same posts.
                print('数据等于50页,分片爬取')
                self.search_by_hour(day)
            else:
                if count == 0:
                    print('数据0页~~!')
                else:
                    print('数据小于50页,直接爬取')
                # Bounds handed to the decoder: 00:00 and 23:00 of the day.
                self.handle_pages(
                    url, count, self.movie.id, day, day.replace(hour=23))

            day -= one_day
            if start_time > day:
                break
            time.sleep(SEARCH_SLEEP_TIME)

    def search_by_hour(self, day):
        """Crawl a single day hour by hour, from 23:00 down to 00:00.

        :param day: any ``datetime`` within the day to crawl.
        """
        midnight = datetime.datetime.strptime(
            day.strftime('%Y-%m-%d'), '%Y-%m-%d')
        for hour in range(23, -1, -1):
            hour_start = midnight + datetime.timedelta(hours=hour)
            # Start and end of the scope are the same hour.
            stamp = hour_start.strftime('%Y-%m-%d-%H')
            url = self._search_url(stamp + ':' + stamp)
            print(url)
            count = self._fetch_page_count(url)
            # Bounds handed to the decoder: minute 0 and minute 59 of the hour.
            self.handle_pages(url, count, self.movie.id,
                              hour_start, hour_start.replace(minute=59))
            # No pause is needed after the last (00:00) slice.
            if hour:
                time.sleep(SEARCH_SLEEP_TIME)

    def get_page_count(self, content):
        """Return the highest ``&page=N`` number linked from *content*.

        Also dumps *content* to the file ``ssssssss`` for debugging.

        :param content: raw response body (a byte string).
        :return: the largest page number found, or 0 when there is none.
        """
        result = re.findall(br'&page=\d+', content)
        with open('ssssssss', 'wb') as dump:
            dump.write(content)
        print('file ssssssss save ok..~!')
        print('page ----------------------->: %s' % (result,))
        max_count = max([int(link.split(b'=')[1]) for link in result] or [0])
        print('当前搜索结果页数: %s' % max_count)
        return max_count

    def is_rebot(self, content):
        """Return True, and send a notification mail, for the captcha page.

        :param content: raw response body (a byte string).
        """
        marker = u'我真滴不是机器人'.encode('utf-8')
        if marker not in self.format_content(content):
            return False
        send_mail('机器人', '快填验证码!')
        return True

    def _crawl_page(self, url, movie_id, start_time, end_time):
        """Fetch one result page, waiting out captcha pages, and decode it.

        :return: the value returned by ``self.movie.decode_content``.
        """
        while True:
            content = self.session.get(url).content
            if self.is_rebot(content):
                print('变机器人了,需要帮助!!')
                time.sleep(REBOT_SLEEP_TIME)
                continue
            return self.movie.decode_content(
                content, movie_id, start_time, end_time)

    def handle_pages(self, url_head, page_count, movie_id, start_time, end_time):
        """Crawl result pages 1..page_count of one search.

        Stops early as soon as the decoder returns a falsy value.

        :param url_head: search URL without the ``&page=`` suffix.
        :param page_count: value reported by :meth:`get_page_count`.
        :param movie_id: passed through to ``self.movie.decode_content``.
        :param start_time: passed through to ``self.movie.decode_content``.
        :param end_time: passed through to ``self.movie.decode_content``.
        :return: the decoder's result for the last page that was requested.
        """
        earliest_time = False
        # get_page_count reports 0 when there are no pagination links, which
        # presumably is also what a single-page result looks like.  Treat it
        # as one page so that result is not silently skipped.
        if page_count == 0:
            page_count = 1
        for i in range(1, page_count + 1):
            url = '%s&page=%s' % (url_head, i)
            print(url)
            earliest_time = self._crawl_page(
                url, movie_id, start_time, end_time)
            if not earliest_time:
                break
            print('现在是第 %s / %s 页' % (i, page_count))
            time.sleep(5)
        return earliest_time

    def format_content(self, content):
        r"""Expand backslash escapes in *content* and return UTF-8 bytes.

        ``\uXXXX`` style escapes are decoded and ``\/`` is turned into ``/``.
        """
        decoded = content.decode('unicode_escape').encode("utf-8")
        return decoded.replace(b"\\/", b"/")