Example #1
0
class DoubanHtmlParser(HTMLParser):
    """Collect movie entries from an HTML page into a list of ``Movie`` objects.

    Markup recognised by the handlers below:

    * ``<div class="info">`` opens a movie entry; its matching ``</div>``
      closes the entry and appends the movie to the result list.
    * ``<li class="title">`` announces the movie name, whose text is taken
      from the next ``MOVIE_NAME_START_TAG`` element.
    * ``<li class="intro">`` holds the movie intro text.
    * ``<span class="date">`` holds the movie viewing date.
    * ``<span class="ratingN-t">`` carries the rating ``N`` in its class name.

    ``Movie`` and ``MOVIE_NAME_START_TAG`` are expected to be defined
    elsewhere in this module.
    """

    def __init__(self):
        # HTMLParser.__init__ implicitly calls reset(), so reset() (and the
        # clear() it invokes) must work before anything else is set up here.
        HTMLParser.__init__(self)
        # A list of movie objects completed so far.
        self._movie_list = []
        # All per-entry state is initialised in one place.
        self.clear()

    def reset(self):
        """Reset the parser, dropping collected movies and any partial entry."""
        HTMLParser.reset(self)
        # Reset clears the movie list ...
        self._movie_list = []
        # ... and any half-parsed entry, so a reused parser starts clean.
        self.clear()

    def clear(self):
        """Clear all state of the current movie; prepare for the next entry."""
        # If True, the parser is currently inside a movie entry.
        self._output_entry = False
        # When this tag is closed, the current movie entry is finished.
        self._output_entry_finish_tag = ''
        # Number of still-open <div> elements nested inside the entry's div.
        self._entry_div_depth = 0

        # If True, the movie name is expected / being collected.
        self._start_movie_name = False
        # If True, the movie intro is being collected.
        self._start_intro = False
        # If True, the movie viewing date is being collected.
        self._start_date = False

        # If True, handle_data() appends text to _current_data.
        self._grab_data = False
        # When this tag is closed, the collected text is stored on the movie.
        self._output_finish_tag = ''

        # Buffer for the text of the field being collected.
        self._current_data = ''
        # Movie object currently being filled in.
        self._movie = None

    @staticmethod
    def _class_values(attrs):
        """Return the values of all ``class`` attributes in *attrs*.

        Attributes written without a value (reported by HTMLParser as
        ``None``) are skipped so callers can safely treat values as strings.
        """
        return [value for name, value in attrs
                if name == 'class' and value is not None]

    @staticmethod
    def _parse_rating(css_class):
        """Return the rating encoded in e.g. ``'rating4-t'``, or ``None``.

        *css_class* must contain the substring ``'rating'``. ``None`` is
        returned when the text between ``'rating'`` and the next ``'-'`` is
        not an integer (for example ``'rating-stars'``).
        """
        number = css_class.split('rating')[1].split('-')[0]
        try:
            return int(number)
        except ValueError:
            return None

    # Invoked when a html start tag is parsed.
    def handle_starttag(self, tag, attrs):
        """Update the parser state for an opening *tag* with *attrs*."""
        classes = self._class_values(attrs)

        if tag == 'div':
            if 'info' in classes:
                # <div class="info"> starts a new movie entry.
                self._output_entry = True
                self._movie = Movie()
                self._output_entry_finish_tag = tag
                self._entry_div_depth = 0
            elif self._output_entry:
                # A div nested inside the entry: remember it so that its
                # </div> is not mistaken for the end of the entry.
                self._entry_div_depth += 1
            return

        if not self._output_entry:
            return

        if tag == 'li':
            for css_class in classes:
                if css_class == 'title':
                    # <li class="title"> includes the movie name.
                    self._start_movie_name = True
                elif css_class == 'intro':
                    # <li class="intro"> includes the movie intro.
                    self._output_finish_tag = tag
                    self._grab_data = True
                    self._start_intro = True
        elif tag == 'span':
            for css_class in classes:
                if css_class == 'date':
                    # <span class="date"> includes the movie view date.
                    self._output_finish_tag = tag
                    self._grab_data = True
                    self._start_date = True
                    # TODO: the watched list keeps everything inside
                    #     <div class="info"> ... </div>
                    # but the wish list has
                    #     <div class="info"> movie info </div>
                    #     <li> entry creation day </li>
                    # i.e. the date sits outside the entry div; that layout
                    # is not handled here.
                if 'rating' in css_class:
                    # <span class="ratingX-t"> has rating information.
                    rating = self._parse_rating(css_class)
                    if rating is not None:
                        self._movie.set_rating(rating)
        elif self._start_movie_name and tag == MOVIE_NAME_START_TAG:
            # Movie name tag is encountered. Store the next data.
            self._output_finish_tag = tag
            self._grab_data = True

    # Invoked when an end tag is parsed.
    def handle_endtag(self, tag):
        """Finish the current field or the whole entry when *tag* closes it."""
        if tag == self._output_entry_finish_tag:
            if self._entry_div_depth > 0:
                # Closing a div nested inside the entry, not the entry itself.
                self._entry_div_depth -= 1
                return
            # This is a movie entry end tag. Finish this movie object.
            self._movie_list.append(self._movie)
            self.clear()
        elif tag == self._output_finish_tag:
            if self._start_movie_name:
                # This is a movie name end tag.
                self._movie.set_name(self._current_data)
                self._start_movie_name = False
                self._current_data = ''
            if self._start_intro:
                # This is a movie intro end tag.
                self._movie.set_intro(self._current_data)
                self._start_intro = False
                self._current_data = ''
            if self._start_date:
                # This is a movie date end tag.
                self._movie.set_date(self._current_data)
                self._start_date = False
                self._current_data = ''
            # Stop storing data.
            self._grab_data = False

    # Invoked when text data is parsed.
    def handle_data(self, data):
        """Append *data* to the buffer while a field is being collected."""
        if self._grab_data:
            # A field's text may arrive in several pieces; keep all of them.
            self._current_data += data

    def get_movie_list(self):
        """Return the list of movies parsed so far."""
        return self._movie_list