class DoubanHtmlParser(HTMLParser):
    """Collect movie entries from HTML fed to the parser.

    The parser is a small state machine driven by the HTMLParser callbacks:

    * ``<div class="info">`` starts a new entry (a fresh ``Movie()``); the
      matching closing tag appends the movie to the result list.
    * Inside an entry, ``<li class="title">`` announces the movie name, which
      is read from the next ``MOVIE_NAME_START_TAG`` element.
    * ``<li class="intro">`` holds the intro text.
    * ``<span class="date">`` holds the viewing date.
    * ``<span class="ratingN-...">`` carries the rating ``N`` in its class.

    Use ``feed()`` to parse and ``get_movie_list()`` to read the result.
    """

    def __init__(self):
        HTMLParser.__init__(self)
        # A list of finished movie objects.
        self._movie_list = []
        # Initialise all per-entry state in one place.
        self.clear()

    def reset(self):
        """Reset the parser, dropping parsed movies and any half-parsed entry."""
        HTMLParser.reset(self)
        # Reset clears the movie list ...
        self._movie_list = []
        # ... and also the per-entry state, so that an interrupted parse
        # cannot leak buffered data or flags into the next document.
        self.clear()

    def clear(self):
        """Clear all state of the current movie to prepare for the next entry."""
        # If True, we are inside a movie entry.
        self._output_entry = False
        # When this tag is closed, the current movie entry is finished.
        self._output_entry_finish_tag = ''
        # If True, the movie name is expected inside the current tag.
        self._start_movie_name = False
        # If True, the data being collected is the movie intro.
        self._start_intro = False
        # If True, the data being collected is the movie viewing date.
        self._start_date = False
        # If True, handle_data() appends incoming text to _current_data.
        self._grab_data = False
        # When this tag is closed, the collected data is stored on the
        # current movie object and collection stops.
        self._output_finish_tag = ''
        # Buffer holding the text collected so far.
        self._current_data = ''
        # Movie object currently being filled in.
        self._movie = None

    @staticmethod
    def _parse_rating(class_value):
        """Return the integer rating encoded in a class such as ``rating4-t``.

        Returns None when the class does not contain ``rating`` or when the
        text between ``rating`` and the next ``-`` is not an integer.
        """
        if 'rating' not in class_value:
            return None
        fragment = class_value.split('rating')[1].split('-')[0]
        try:
            return int(fragment)
        except ValueError:
            # e.g. class="rating_num": mentions "rating" but has no number.
            return None

    def _start_grabbing(self, finish_tag):
        """Start buffering text until ``finish_tag`` is closed."""
        self._output_finish_tag = finish_tag
        self._grab_data = True

    def handle_starttag(self, tag, attrs):
        """Invoked when an HTML start tag is parsed."""
        # Valueless attributes (e.g. <span class>) are reported with a None
        # value; they can never match, so drop them up front.
        class_values = [value for name, value in attrs
                        if name == 'class' and value is not None]

        if tag == 'div':
            if 'info' in class_values:
                # <div class="info"> starts a new movie entry.
                # NOTE(review): handle_endtag() finishes the entry on any
                # closing tag with this name, so a <div> nested inside the
                # entry would end it early.
                self._output_entry = True
                self._movie = Movie()
                self._output_entry_finish_tag = tag
            return

        if not self._output_entry:
            return

        if tag == 'li':
            for class_value in class_values:
                if class_value == 'title':
                    # <li class="title"> includes the movie name.
                    self._start_movie_name = True
                elif class_value == 'intro':
                    # <li class="intro"> includes the movie intro.
                    self._start_grabbing(tag)
                    self._start_intro = True
        elif tag == 'span':
            for class_value in class_values:
                if class_value == 'date':
                    # <span class="date"> includes the movie view date.
                    self._start_grabbing(tag)
                    self._start_date = True
                    # TODO
                    # watched list has
                    #   <div class="info">
                    #     everything
                    #   </div>
                    # but wish list has
                    #   <div class="info">
                    #     movie info
                    #   </div>
                    #   <li>
                    #     entry creation day
                    #   </li>
                # <span class="ratingX-t"> has rating information.
                rating = self._parse_rating(class_value)
                if rating is not None:
                    self._movie.set_rating(rating)
        elif self._start_movie_name and tag == MOVIE_NAME_START_TAG:
            # Movie name tag is encountered. Store the next data.
            self._start_grabbing(tag)

    def handle_endtag(self, tag):
        """Invoked when an HTML end tag is parsed."""
        if tag == self._output_entry_finish_tag:
            # This is a movie entry end tag. Finish this movie object.
            self._movie_list.append(self._movie)
            self.clear()
            return

        if tag != self._output_finish_tag:
            return

        if self._start_movie_name:
            # This is a movie name end tag.
            self._movie.set_name(self._current_data)
            self._start_movie_name = False
            self._current_data = ''
        if self._start_intro:
            # This is a movie intro end tag.
            self._movie.set_intro(self._current_data)
            self._start_intro = False
            self._current_data = ''
        if self._start_date:
            # This is a movie date end tag.
            self._movie.set_date(self._current_data)
            self._start_date = False
            self._current_data = ''
        # Stop storing data.
        self._grab_data = False

    def handle_data(self, data):
        """Invoked when text data is parsed."""
        if self._grab_data:
            # While collecting, append all the data to _current_data.
            self._current_data += data

    def get_movie_list(self):
        """Return the list of movies parsed so far."""
        return self._movie_list