def testMechanizeOpener(self):
    """Smoke-test MechanizeOpener against a live page (requires network)."""
    url = 'http://www.baidu.com'
    mech = MechanizeOpener()
    # open() returns the raw page body as a string.
    assert 'baidu' in mech.open(url)
    # browse_open() returns a mechanize browser positioned on the page.
    browser = mech.browse_open(url)
    assert u'百度' in browser.title()
    assert 'baidu' in browser.response().read()
def testMechanizeOpener(self):
    """Exercise both MechanizeOpener entry points on a known live URL."""
    opener = MechanizeOpener()
    target = 'http://www.baidu.com'
    # Direct fetch of the page body.
    body = opener.open(target)
    assert 'baidu' in body
    # Stateful browser fetch of the same page.
    br = opener.browse_open(target)
    assert u'百度' in br.title()
    assert 'baidu' in br.response().read()
def testMechanizeOpener(self):
    """Verify MechanizeOpener can both fetch and browse a live page."""
    opener = MechanizeOpener()
    page_url = "http://www.baidu.com"
    # The plain fetch should contain the site name.
    assert "baidu" in opener.open(page_url)
    # The browser view should expose the parsed title and raw body.
    browser = opener.browse_open(page_url)
    assert "百度" in browser.title()
    assert "baidu" in browser.response().read()
class WikiParser(Parser):
    """Parse a Wikipedia article page, persist it as a WikiDocument and
    return the outgoing links to crawl next."""

    def __init__(self, opener=None, url=None, **kw):
        super(WikiParser, self).__init__(opener=opener, url=url, **kw)
        if self.opener is None:
            self.opener = MechanizeOpener()
        # Strip HTML comments before handing the markup to BeautifulSoup.
        self.html_comment_reg = re.compile(r'<!--[^-]+-->', re.DOTALL)
        # English footer timestamp, e.g. "12 January 2014 at 3:45".
        self.en_time_reg = re.compile(
            r'\d{1,2} [A-Z][a-z]{2,} \d{4} at \d{1,2}:\d{1,2}')
        # Chinese footer timestamp, e.g. "2014年1月12日 (六) 3:45".
        # A plain unicode literal with escaped backslashes replaces the
        # py2-only ur'' prefix (ur'' is a syntax error on Python 3).
        self.zh_time_reg = re.compile(
            u'\\d{4}年\\d{1,2}月\\d{1,2}日 \\(.+\\) \\d{1,2}:\\d{1,2}')

    def store(self, title, content, last_update):
        """Upsert a WikiDocument, keeping only the newest revision."""
        try:
            doc = WikiDocument.objects.get(title=title)
            if last_update > doc.last_update:
                doc.content = content
                doc.last_update = last_update
                doc.update(upsert=True)
        except DoesNotExist:
            doc = WikiDocument(title=title, content=content,
                               last_update=last_update)
            doc.save()

    def _extract(self, soup):
        """Return (title, content, last_update) for a parsed article page,
        or (None, None, None) when the page has no <head>."""
        if soup.head is None:
            return None, None, None
        title = soup.head.title.text
        if '-' in title:
            # Drop the " - Wikipedia" suffix from the page title.
            title = title.split('-')[0].strip()
        content = soup.find('div', attrs={'id': 'mw-content-text',
                                          'class': 'mw-content-ltr'})
        # Drop infobox/navigation tables, keep the plain article text.
        while content.table is not None:
            content.table.extract()
        content = content.text
        last_update_str = soup.find(
            'li', attrs={'id': 'footer-info-lastmod'}).text
        last_update = None
        match_en_time = self.en_time_reg.search(last_update_str)
        if match_en_time:
            last_update = parse(match_en_time.group())
        match_zh_time = self.zh_time_reg.search(last_update_str)
        if match_zh_time:
            last_update = match_zh_time.group()
            # Drop the "(weekday)" part and turn 年/月/日 into ISO dashes
            # so dateutil can parse the result.
            last_update = re.sub(r'\([^\)]+\)\s', '', last_update)
            last_update = last_update.replace(u'年', '-')\
                .replace(u'月', '-').replace(u'日', '')
            last_update = parse(last_update)
        if last_update is None:
            last_update = datetime.now()
        return title, content, last_update

    def parse(self, url=None):
        """Parse `url` (or self.url), store the article and return the
        list of absolute outgoing links."""
        url = url or self.url
        # BUGFIX: str.strip('http://') removes any of the characters
        # "htp:/" from BOTH ends, mangling hosts such as
        # "th.wikipedia.org" -> ".wikipedia.org". Take the language code
        # from the parsed hostname instead.
        lang = urlparse.urlparse(url).netloc.split('.', 1)[0]
        br = self.opener.browse_open(url)
        html = br.response().read()
        html = self.html_comment_reg.sub('', html)
        soup = BeautifulSoup(html)
        title, content, last_update = self._extract(soup)
        if title is None:
            return []
        title = title + ' ' + lang
        self.store(title, content, last_update)

        def _is_same(out_url):
            # Same page when only the '#fragment' differs.
            return out_url.rsplit('#', 1)[0] == url

        links = []
        for link in br.links():
            if link.url.startswith('http://'):
                out_url = link.url
            else:
                # Relative link: resolve against the page it came from.
                out_url = urlparse.urljoin(link.base_url, link.url)
            if not _is_same(out_url):
                links.append(out_url)
        return links
class WikiParser(Parser):
    """Parse a Wikipedia article page, persist it as a WikiDocument and
    yield absolute outgoing links to crawl next (generator)."""

    def __init__(self, opener=None, url=None, **kw):
        super(WikiParser, self).__init__(opener=opener, url=url, **kw)
        if self.opener is None:
            self.opener = MechanizeOpener()
        # Strip HTML comments before handing the markup to BeautifulSoup.
        self.html_comment_reg = re.compile(r'<!--[^-]+-->', re.DOTALL)
        # English footer timestamp, e.g. "12 January 2014 at 3:45".
        self.en_time_reg = re.compile(
            r'\d{1,2} [A-Z][a-z]{2,} \d{4} at \d{1,2}:\d{1,2}')
        # Chinese footer timestamp, e.g. "2014年1月12日 (六) 3:45".
        # A plain unicode literal with escaped backslashes replaces the
        # py2-only ur'' prefix (ur'' is a syntax error on Python 3).
        self.zh_time_reg = re.compile(
            u'\\d{4}年\\d{1,2}月\\d{1,2}日 \\(.+\\) \\d{1,2}:\\d{1,2}')

    def store(self, title, content, last_update):
        """Upsert a WikiDocument, keeping only the newest revision."""
        try:
            doc = WikiDocument.objects.get(title=title)
            if last_update > doc.last_update:
                doc.content = content
                doc.last_update = last_update
                doc.update(upsert=True)
        except DoesNotExist:
            doc = WikiDocument(title=title, content=content,
                               last_update=last_update)
            doc.save()

    def _extract(self, soup):
        """Return (title, content, last_update) for a parsed article page,
        or (None, None, None) when the page has no <head>."""
        if soup.head is None:
            return None, None, None
        title = soup.head.title.text
        if '-' in title:
            # Drop the " - Wikipedia" suffix from the page title.
            title = title.split('-')[0].strip()
        content = soup.find('div', attrs={'id': 'mw-content-text',
                                          'class': 'mw-content-ltr'})
        # Drop infobox/navigation tables, keep the plain article text.
        while content.table is not None:
            content.table.extract()
        content = content.text
        last_update_str = soup.find(
            'li', attrs={'id': 'footer-info-lastmod'}).text
        last_update = None
        match_en_time = self.en_time_reg.search(last_update_str)
        if match_en_time:
            last_update = parse(match_en_time.group())
        match_zh_time = self.zh_time_reg.search(last_update_str)
        if match_zh_time:
            last_update = match_zh_time.group()
            # Drop the "(weekday)" part and turn 年/月/日 into ISO dashes
            # so dateutil can parse the result.
            last_update = re.sub(r'\([^\)]+\)\s', '', last_update)
            last_update = last_update.replace(u'年', '-')\
                .replace(u'月', '-').replace(u'日', '')
            last_update = parse(last_update)
        if last_update is None:
            last_update = datetime.now()
        return title, content, last_update

    def parse(self, url=None):
        """Parse `url` (or self.url), store the article and yield the
        absolute outgoing links."""
        url = url or self.url
        # BUGFIX: str.strip('http://') removes any of the characters
        # "htp:/" from BOTH ends, mangling hosts such as
        # "th.wikipedia.org" -> ".wikipedia.org". Take the language code
        # from the parsed hostname instead.
        lang = urlparse.urlparse(url).netloc.split('.', 1)[0]
        br = self.opener.browse_open(url)
        html = br.response().read()
        html = self.html_comment_reg.sub('', html)
        soup = BeautifulSoup(html)
        title, content, last_update = self._extract(soup)
        if not title:
            return
        title = title + ' ' + lang
        self.store(title, content, last_update)

        def _is_same(out_url, url):
            # Same page when only the '#fragment' differs.
            return out_url.rsplit('#', 1)[0] == url

        for link in br.links():
            q = urlparse.urlparse(link.url)
            if q.scheme in ['http', 'https']:
                out_url = link.url
                if not _is_same(out_url, url):
                    yield out_url
            elif not q.scheme:
                # Scheme-less links are relative: resolve against base_url.
                out_url = urlparse.urljoin(link.base_url, link.url)
                if not _is_same(out_url, url):
                    yield out_url
class DoubanMovieParser(Parser):
    """Parse a douban movie subject page into a DoubanMovie document and
    yield further subject URLs found in the recommendation section."""

    def __init__(self, opener=None, url=None, bundle=None, **kwargs):
        super(DoubanMovieParser, self).__init__(opener=opener, url=url,
                                                **kwargs)
        if self.opener is None:
            self.opener = MechanizeOpener()
        self.url = url
        self.opener.set_default_timeout(TIMEOUT)
        if not hasattr(self, 'logger') or self.logger is None:
            self.logger = get_logger(name='douban_parser')

    def get_subject_id(self, url):
        """Extract the numeric subject id from a douban movie URL, or
        return None when the URL does not match."""
        id_arr = re.findall(r'https://movie.douban.com/subject/(\d+)', url)
        if id_arr:
            return id_arr[0]

    def _check_url(self, dest_url, src_url):
        """Check whether two urls share the same path (query ignored)."""
        return dest_url.split('?')[0] == src_url.split('?')[0]

    def check(self, url, br):
        """Raise DoubanLoginFailure when the request was redirected to the
        login page; otherwise report the page as usable."""
        dest_url = br.geturl()
        if not self._check_url(dest_url, url):
            if dest_url.startswith('http://douban.com/login.php'):
                raise DoubanLoginFailure('Douban not login or login expired')
        return True

    def get_movie_subject(self, sid):
        """Fetch the DoubanMovie document for `sid`, creating it when it
        does not exist yet."""
        try:
            movie = getattr(DoubanMovie, 'objects').get(sid=sid)
        except DoesNotExist:
            movie = DoubanMovie(sid=sid)
            movie.save()
        return movie

    def parse(self, url=None):
        """Parse one subject page, persist the movie document, and yield
        recommended subject URLs to crawl next (generator)."""
        url = url or self.url
        sid = self.get_subject_id(url)
        movie = self.get_movie_subject(sid)
        # If the entry was updated within the last 24 hours, skip this url.
        # BUGFIX: the old condition (`.days > 1`) skipped *stale* entries
        # and re-parsed fresh ones, the opposite of the stated intent.
        if movie.last_update and abs(
                (datetime.utcnow() - movie.last_update).days) < 1:
            self.logger.warn('Skip visited url: %s' % url)
            return
        self.logger.debug('proxy:{}'.format(self.opener.proxies))
        try:
            br = self.opener.browse_open(url)
        except URLError:
            raise FetchBannedError()
        if not self.check(url, br):
            return
        html = br.response().read()
        if html is None:
            raise FetchBannedError()
        soup = beautiful_soup(html)
        # A page listing an episode count is a TV show ('t'), else movie.
        if re.compile('<span class="pl">集数:</span>').findall(html):
            subtype = 't'
        else:
            subtype = 'm'
        try:
            title = soup.select(
                "span[property='v:itemreviewed']")[0].text.strip()
        except (IndexError, AttributeError):
            # A missing title usually means an anti-crawler placeholder.
            raise FetchBannedError()
        year_tags = soup.select("div#content > h1 span.year")
        if year_tags:
            # The year is rendered as "(2001)"; strip the parentheses.
            year = year_tags[0].text[1:-1]
        else:
            year = None
        summary_tags = soup.select("span[property='v:summary']")
        summary = summary_tags[0].text.strip() if summary_tags else ''
        # User tags.
        tag_tags = soup.select('div .tags-body a')
        tags = [t.text for t in tag_tags]
        # Directors.
        director_tags = soup.select('div #info > span a[rel="v:directedBy"]')
        director_reg = re.compile(r'<[^>]+>(?P<director>[^<]+)</a>')
        directors = [director_reg.match(str(t)).group('director')
                     for t in director_tags]
        # Cast.
        star_tags = soup.select('div #info > span a[rel="v:starring"]')
        star_reg = re.compile(r'<[^>]+>(?P<star>[^<]+)</a>')
        casts = [star_reg.match(str(t)).group('star') for t in star_tags]
        # Writers (anchors inside the second <span> of #info).
        writers_tags = soup.select('div #info > span')[1].select('a')
        writer_reg = re.compile(r'<[^>]+>(?P<writer>[^<]+)</a>')
        writers = [writer_reg.match(str(t)).group('writer')
                   for t in writers_tags]
        # Genres.
        genre_tags = soup.select('div #info > span[property="v:genre"]')
        genre_reg = re.compile(
            r'<span property="v:genre">(?P<genre>[^<]+)</span>')
        genres = [genre_reg.match(str(t)).group('genre') for t in genre_tags]
        # Release date: prefer mainland China / Hong Kong dates, then any
        # bare "YYYY[-MM[-DD]]" date, then fall back to the year.
        pubdate_tag = soup.select(
            'div #info > span[property="v:initialReleaseDate"]')
        pubdate = None
        found = False
        if pubdate_tag:
            region_regs = [
                re.compile(
                    r'<[^>]+>(?P<pubdate>[^(]+)[(]中国大陆([ ]3D)*[)]<[^>]+>'),
                re.compile(
                    r'<[^>]+>(?P<pubdate>[^(]+)[(]中国内地([ ]3D)*[)]<[^>]+>'),
                re.compile(
                    r'<[^>]+>(?P<pubdate>[^(]+)[(]香港([ ]3D)*[)]<[^>]+>'),
            ]
            plain_date_reg = re.compile(r'[0-9-]+')
            for t in pubdate_tag:
                tag_str = str(t)
                for reg in region_regs:
                    m = reg.search(tag_str)
                    if m is not None:
                        found = True
                        pubdate = m.group('pubdate')
                        break
                if found:
                    break
                m = plain_date_reg.search(tag_str)
                if m is not None:
                    found = True
                    pubdate = m.group()
                    break
        if not found:
            self.logger.critical('{0} has no pubdate'.format(sid))
            pubdate = year
        # Append month/day when only part of the date is known.
        # NOTE(review): if neither a release date nor a year is present,
        # pubdate is None and len() below raises TypeError — presumably
        # every subject page carries at least a year; confirm upstream.
        if len(pubdate) == 4:
            pubdate = pubdate + "-6-30"
        elif len(pubdate) == 7:
            pubdate = pubdate + "-15"
        pubdate = datetime.strptime(pubdate, '%Y-%m-%d')
        if not year:
            year = pubdate.strftime('%Y')
        # Wish ("want to see") and collection ("seen") counts.
        wishes_tags = soup.select(
            'div #subject-others-interests > '
            '.subject-others-interests-ft > a')
        if not wishes_tags:
            self.logger.critical('{0} donnot have wish count'.format(sid))
        wish_count = None
        collect_count = None
        for wt in wishes_tags:
            m = re.match(u'(?P<wishes>[0-9]+)人想看', wt.text)
            if m:
                wish_count = m.group('wishes')
                continue
            m = re.match(u'(?P<collections>[0-9]+)人看过', wt.text)
            if m:
                collect_count = m.group('collections')
        rating_num = soup.select(r'strong.rating_num')[0].text
        if not rating_num:
            rating_num = None
        rating_lvls = soup.select(r'div.ratings-on-weight span.rating_per')
        if rating_lvls:
            # Percent strings like "35.2%" -> floats.
            rating_lvls = [float(r.text[:-1]) for r in rating_lvls]
        # Season info (TV shows only).
        # BUGFIX: the selector had a stray ']' (matched nothing), and
        # seasons_count was assigned the bound list.count method instead
        # of a number. Counting the <option> entries of the season
        # dropdown — presumably one option per season; TODO confirm.
        season_tags = soup.select('div #info select#season')
        if season_tags:
            movie.seasons_count = len(season_tags[0].select('option'))
            movie.current_season = season_tags[0].select(
                'option[selected]')[0].text
        photo_url = soup.select('a[class="nbgnbg"] img')[0].attrs['src']

        def parseNumber(v):
            # Parse values like "120分钟"; fall back to Chinese numerals.
            m = re.findall(r'(\d+).*', v)
            if m:
                return int(m[0])
            else:
                return convert(v.strip())

        # Map "#info" labels to document fields, with optional value parser.
        info_map = {
            u'制片国家/地区': {'field': 'countries'},
            u'语言': {'field': 'languages'},
            u'集数': {'field': 'episodes_count', 'func': parseNumber},
            u'单集片长': {'field': 'duration', 'func': parseNumber},
            u'片长': {'field': 'duration', 'func': parseNumber},
            u'又名': {'field': 'aka', 'func': lambda v: v.split('/')},
            u'IMDb链接': {'field': 'imdb_id'},
        }
        info_str = soup.select('div #info')[0].text
        for k, f in info_map.items():
            v = re.findall(k + u'\\:(.*)', info_str, re.MULTILINE)
            if v:
                func = f.get('func', lambda s: s.strip())
                setattr(movie, f['field'], func(v[0].strip()))
        movie.sid = sid
        movie.title = title
        movie.photo_alt = photo_url
        movie.year = year
        movie.summary = summary
        movie.tags = tags
        movie.subtype = subtype
        movie.directors = directors
        movie.casts = casts
        movie.writers = writers
        if rating_num:
            movie.rating = float(rating_num)
        if rating_lvls:
            # High = 5+4 star share, low = 2+1 star share.
            movie.high_rating_pct = rating_lvls[0] + rating_lvls[1]
            movie.low_rating_pct = rating_lvls[3] + rating_lvls[4]
        if wish_count:
            movie.wish_count = wish_count
        if collect_count:
            movie.collect_count = collect_count
        movie.pubdate = pubdate
        movie.genres = genres
        movie.alt = url
        movie.last_update = datetime.now()
        movie.save()

        def _is_same(out_url, src_url):
            # Same page when only the '#fragment' differs.
            return out_url.rsplit('#', 1)[0] == src_url

        # Yield recommended subjects, skipping self-links and the subject
        # we just parsed.
        next_urls = soup.select("div.recommendations-bd a")
        for link in next_urls:
            out_url = link.attrs['href']
            if not _is_same(out_url, url) and out_url.startswith(
                    "https://movie.douban.com/subject"):
                sid_next = self.get_subject_id(out_url)
                if sid_next != sid:
                    yield out_url
class WikiParser(Parser):
    """Parse a Wikipedia article page, persist it as a WikiDocument and
    yield outgoing links to crawl next (generator)."""

    def __init__(self, opener=None, url=None, **kw):
        super(WikiParser, self).__init__(opener=opener, url=url, **kw)
        if self.opener is None:
            self.opener = MechanizeOpener()
        # Strip HTML comments before handing the markup to BeautifulSoup.
        self.html_comment_reg = re.compile(r"<!--[^-]+-->", re.DOTALL)
        # English footer timestamp, e.g. "12 January 2014 at 3:45".
        self.en_time_reg = re.compile(
            r"\d{1,2} [A-Z][a-z]{2,} \d{4} at \d{1,2}:\d{1,2}")
        # Chinese footer timestamp, e.g. "2014年1月12日 (六) 3:45".
        # A plain unicode literal with escaped backslashes replaces the
        # py2-only ur"" prefix (ur"" is a syntax error on Python 3).
        self.zh_time_reg = re.compile(
            u"\\d{4}年\\d{1,2}月\\d{1,2}日 \\(.+\\) \\d{1,2}:\\d{1,2}")

    def store(self, title, content, last_update):
        """Upsert a WikiDocument, keeping only the newest revision."""
        try:
            doc = WikiDocument.objects.get(title=title)
            if last_update > doc.last_update:
                doc.content = content
                doc.last_update = last_update
                doc.update(upsert=True)
        except DoesNotExist:
            doc = WikiDocument(title=title, content=content,
                               last_update=last_update)
            doc.save()

    def _extract(self, soup):
        """Return (title, content, last_update) for a parsed article page,
        or (None, None, None) when the page has no <head>."""
        if soup.head is None:
            return None, None, None
        title = soup.head.title.text
        if "-" in title:
            # Drop the " - Wikipedia" suffix from the page title.
            title = title.split("-")[0].strip()
        content = soup.find("div", attrs={"id": "mw-content-text",
                                          "class": "mw-content-ltr"})
        # Drop infobox/navigation tables, keep the plain article text.
        while content.table is not None:
            content.table.extract()
        content = content.text
        last_update_str = soup.find(
            "li", attrs={"id": "footer-info-lastmod"}).text
        last_update = None
        match_en_time = self.en_time_reg.search(last_update_str)
        if match_en_time:
            last_update = parse(match_en_time.group())
        match_zh_time = self.zh_time_reg.search(last_update_str)
        if match_zh_time:
            last_update = match_zh_time.group()
            # Drop the "(weekday)" part and turn 年/月/日 into ISO dashes
            # so dateutil can parse the result.
            last_update = re.sub(r"\([^\)]+\)\s", "", last_update)
            last_update = last_update.replace(u"年", "-")\
                .replace(u"月", "-").replace(u"日", "")
            last_update = parse(last_update)
        if last_update is None:
            last_update = datetime.now()
        return title, content, last_update

    def parse(self, url=None):
        """Parse `url` (or self.url), store the article and yield the
        absolute outgoing links."""
        url = url or self.url
        # BUGFIX: str.strip("http://") removes any of the characters
        # "htp:/" from BOTH ends, mangling hosts such as
        # "th.wikipedia.org" -> ".wikipedia.org". Take the language code
        # from the parsed hostname instead.
        lang = urlparse.urlparse(url).netloc.split(".", 1)[0]
        br = self.opener.browse_open(url)
        html = br.response().read()
        html = self.html_comment_reg.sub("", html)
        soup = BeautifulSoup(html)
        title, content, last_update = self._extract(soup)
        if title is None:
            return
        title = title + " " + lang
        self.store(title, content, last_update)

        def _is_same(out_url):
            # Same page when only the '#fragment' differs.
            return out_url.rsplit("#", 1)[0] == url

        for link in br.links():
            if link.url.startswith("http://"):
                out_url = link.url
            else:
                # Relative link: resolve against the page it came from.
                out_url = urlparse.urljoin(link.base_url, link.url)
            if not _is_same(out_url):
                yield out_url