def parse_novel_page(self, url, html):
    """Parse a novel detail page into a metadata dict.

    :param url: page URL; used to derive the site and the book/novel ids.
    :param html: raw HTML of the novel detail page.
    :returns: dict with keys 'url', 'site', 'rank', 'name', 'author',
        'bid', 'nid' and, when present on the page, 'image', 'desc',
        'category', 'status' and 'list_url'; None when the page does not
        match the expected layout.
    """
    soup = BeautifulSoup(html)
    if soup is None:
        # Fix: use the module logger (as __get_crawler__ does) instead of
        # a bare print statement.
        logger.debug("soup is None")
        return None
    novel = {'url': url}
    site = utils.get_site(url)
    novel['site'] = site
    box_intro = soup.find('div', {'class': 'box_intro'})
    # Fix: box_intro was dereferenced unguarded and raised AttributeError
    # on pages without the expected structure.
    if box_intro is None:
        return None
    pic = box_intro.find('div', {'class': 'pic'})
    # Fix: guard against a missing cover block / img tag.
    if pic is not None and pic.img is not None:
        novel['image'] = pic.img.get('src')
    info = box_intro.find('div', {'class': 'box_info'})
    if info is None:
        return None
    rank = {}
    ren = info.find('em', {'id': 'ren'})
    if ren is not None:
        rank['review_count'] = int(ren.text.encode('utf-8'))
    novel['rank'] = rank
    h1 = info.find('h1')
    if h1 is None:
        return None
    name_author = h1.text.encode('utf-8')
    if "作者:" not in name_author:
        return None
    # Fix: split at most once so a second occurrence of the marker inside
    # the title no longer raises ValueError on unpacking.
    name, author = name_author.split("作者:", 1)
    novel['name'] = name.strip()
    novel['author'] = author.strip()
    novel['bid'] = utils.make_book_id(name, author)
    novel['nid'] = utils.make_novel_id(name, author, site)
    desc = info.find('div', {'class': 'intro'})
    # Fix: a page without an intro block previously crashed on desc.text.
    if desc is not None:
        novel['desc'] = desc.text.encode("utf-8").replace(" ", "").strip().lstrip('\r')
    infos = info.find('tr', {'valign': 'top'})
    if infos is not None:
        for td in infos.findAll('td'):
            txt = td.text.encode('utf-8')
            tmp = txt.split(':', 1)
            if len(tmp) != 2:
                continue
            if tmp[0] == '文章分类':
                novel['category'] = tmp[1]
            elif tmp[0] == '文章状态':
                novel['status'] = tmp[1]
    options = info.find('div', {'class': 'option'})
    if options is not None:
        # Fix: guard the span/anchor lookups that were chained unguarded.
        btopt = options.find('span', {'class': 'btopt'})
        if btopt is not None and btopt.a is not None:
            novel['list_url'] = btopt.a.get('href')
    return novel
def __get_crawler__(self, url):
    """Return the crawler registered for the site of *url*.

    :param url: page URL whose host selects the crawler.
    :returns: the crawler instance, or None when the site cannot be
        identified or no crawler is registered for it.
    """
    site = utils.get_site(url)
    if site is None:
        logger.debug("Unsupported site: %s" % url)
        return None
    crawler = self.crawlers.get(site, None)
    if crawler is None:
        # Fix: this failure mode is "site recognized but no crawler
        # registered", not "unsupported site" -- log a distinct message
        # so the two cases are separable in the logs.
        logger.debug("No crawler registered for site: %s" % url)
    return crawler
def _get_profile_manager(site):
    """Create a UserProfileManager bound to the server context of *site*.

    :param site: site identifier (normalized via utils.get_site).
    :returns: a new UserProfileManager for the resolved site's context.
    """
    resolved_site = utils.get_site(site)
    context = ServerContext.GetContext(resolved_site)
    return UserProfileManager(context)