class ImpMakerParser(Parser):
    def __init__(self, opener=None, url=None, **kw):
        super(ImpMakerParser, self).__init__(opener=opener, url=url, **kw)
        self.logger = kw.get('logger')

    def parse(self, url=None):
        url = url or self.url
        if 'click' not in url:
            times = random.randrange(2, 5)
        else:
            times = 1
        i = 0
        self.opener = MechanizeOpener(user_agent=random_user_agent())
        odds = random.randint(0, 100)
        if 'click' not in url or odds <= 5:
            # add proxy
            p_ = get_ip_proxy()
            if p_:
                self.opener.remove_proxy()
                self.opener.add_proxy(p_)
        while i < times:
            html = self.opener.open(url)
            i += 1
            time.sleep(.1)
        return url
def testMechanizeOpener(self): test_url = "http://www.baidu.com" opener = MechanizeOpener() assert "baidu" in opener.open(test_url) br = opener.browse_open(test_url) assert "百度" in br.title() assert "baidu" in br.response().read()
def testMechanizeOpener(self):
    test_url = 'http://www.baidu.com'
    opener = MechanizeOpener()
    assert 'baidu' in opener.open(test_url)

    br = opener.browse_open(test_url)
    assert u'百度' in br.title()
    assert 'baidu' in br.response().read()
def __init__(self, opener=None, url=None, **kw):
    super(WikiParser, self).__init__(opener=opener, url=url, **kw)
    if self.opener is None:
        self.opener = MechanizeOpener()
    self.html_comment_reg = re.compile(r'<!--[^-]+-->', re.DOTALL)
    self.en_time_reg = re.compile(r'\d{1,2} [A-Z][a-z]{2,} \d{4} at \d{1,2}:\d{1,2}')
    self.zh_time_reg = re.compile(ur'\d{4}年\d{1,2}月\d{1,2}日 \(.+\) \d{1,2}:\d{1,2}')
def __init__(self, opener=None, url=None, bundle=None, **kwargs):
    super(DoubanMovieParser, self).__init__(opener=opener, url=url, **kwargs)
    if self.opener is None:
        self.opener = MechanizeOpener()
    self.url = url
    self.opener.set_default_timeout(TIMEOUT)
    if not hasattr(self, 'logger') or self.logger is None:
        self.logger = get_logger(name='douban_parser')
def refresh_cookies(self, ck_dir):
    """refresh cookie db"""
    idx = 0
    # del all cookies
    if os.path.isdir(cookie_dir):
        shutil.rmtree(cookie_dir)
    os.mkdir(cookie_dir)
    # add cookie from folder
    accounts = []
    for root, dirs, files in os.walk(ck_dir):
        for filespath in files:
            full_name = os.path.join(root, filespath)
            with open(full_name) as f:
                for line in f.readlines():
                    if line:
                        u, p = line.split('\t')
                        if u and p:
                            accounts.append((u.strip(), p.strip()))
    # save cookie
    for u, p in accounts:
        opener = MechanizeOpener(
            user_agent='Baiduspider+(+http://www.baidu.com/search/spider.htm)',
            timeout=10)
        opener.browser.set_proxies({'http': get_ip_proxy(size=10)})
        lm = WeiboLogin(opener, u, p)
        try:
            status = lm.login()
        except Exception as ex:
            self.logger.warn("login error:%s" % u)
            self.logger.error(ex)
            continue
        if status:
            idx += 1
            opener.cj.save(os.path.join(cookie_dir, '%d.txt' % idx),
                           ignore_discard=True, ignore_expires=True)
            self.validated.append("%s\t%s\r\n" % (u, p))
        opener.close()
def setUp(self):
    self.test_uid = '1784725941'
    self.bundle = WeiboUserBundle(self.test_uid)
    self.opener = MechanizeOpener()
    self.conn = Connection()
    self.db = self.conn[getattr(user_config.job, 'db')]
    self.collection = self.db.weibo_user

    assert len(user_config.job['login']) > 0

    login_hook(self.opener, **user_config.job['login'][0])
def setUp(self):
    self.test_uid = '1667486960'
    self.bundle = WeiboUserBundle(self.test_uid)
    self.opener = MechanizeOpener()
    self.conn = MongoClient()
    self.db = self.conn[getattr(user_config.job, 'db')]
    self.users_collection = self.db.weibo_user
    self.weibos_collection = self.db.micro_blog

    #assert len(user_config.job['login']) > 0

    login_hook(self.opener, **user_config.job['login'][0])
from cola.core.opener import MechanizeOpener
import re

for i in range(100):
    # MechanizeOpener.open() returns the page HTML as a string
    html = MechanizeOpener().open('https://google.com')
    match = re.search('<input .+?>', html)
    if match:
        print(match.group())
class WikiParser(Parser):
    def __init__(self, opener=None, url=None, **kw):
        super(WikiParser, self).__init__(opener=opener, url=url, **kw)
        if self.opener is None:
            self.opener = MechanizeOpener()
        self.html_comment_reg = re.compile(r"<!--[^-]+-->", re.DOTALL)
        self.en_time_reg = re.compile(r"\d{1,2} [A-Z][a-z]{2,} \d{4} at \d{1,2}:\d{1,2}")
        self.zh_time_reg = re.compile(ur"\d{4}年\d{1,2}月\d{1,2}日 \(.+\) \d{1,2}:\d{1,2}")

    def store(self, title, content, last_update):
        try:
            doc = WikiDocument.objects.get(title=title)
            if last_update > doc.last_update:
                doc.content = content
                doc.last_update = last_update
                doc.update(upsert=True)
        except DoesNotExist:
            doc = WikiDocument(title=title, content=content, last_update=last_update)
            doc.save()

    def _extract(self, soup):
        if soup.head is None:
            return None, None, None
        title = soup.head.title.text
        if "-" in title:
            title = title.split("-")[0].strip()
        content = soup.find("div", attrs={"id": "mw-content-text", "class": "mw-content-ltr"})
        while content.table is not None:
            content.table.extract()
        content = content.text
        last_update_str = soup.find("li", attrs={"id": "footer-info-lastmod"}).text
        last_update = None
        match_en_time = self.en_time_reg.search(last_update_str)
        if match_en_time:
            last_update = match_en_time.group()
            last_update = parse(last_update)
        match_zh_time = self.zh_time_reg.search(last_update_str)
        if match_zh_time:
            last_update = match_zh_time.group()
            last_update = re.sub(r"\([^\)]+\)\s", "", last_update)
            last_update = last_update.replace(u"年", "-").replace(u"月", "-").replace(u"日", "")
            last_update = parse(last_update)
        if last_update is None:
            last_update = datetime.now()
        return title, content, last_update

    def parse(self, url=None):
        url = url or self.url
        lang = url.strip("http://").split(".", 1)[0]
        br = self.opener.browse_open(url)
        html = br.response().read()
        html = self.html_comment_reg.sub("", html)
        soup = BeautifulSoup(html)
        title, content, last_update = self._extract(soup)
        if title is None:
            return
        title = title + " " + lang
        self.store(title, content, last_update)

        def _is_same(out_url):
            return out_url.rsplit("#", 1)[0] == url

        for link in br.links():
            if link.url.startswith("http://"):
                out_url = link.url
                if not _is_same(out_url):
                    yield out_url
            else:
                out_url = urlparse.urljoin(link.base_url, link.url)
                if not _is_same(out_url):
                    yield out_url
class WikiParser(Parser):
    def __init__(self, opener=None, url=None, **kw):
        super(WikiParser, self).__init__(opener=opener, url=url, **kw)
        if self.opener is None:
            self.opener = MechanizeOpener()
        self.html_comment_reg = re.compile(r'<!--[^-]+-->', re.DOTALL)
        self.en_time_reg = re.compile(r'\d{1,2} [A-Z][a-z]{2,} \d{4} at \d{1,2}:\d{1,2}')
        self.zh_time_reg = re.compile(ur'\d{4}年\d{1,2}月\d{1,2}日 \(.+\) \d{1,2}:\d{1,2}')

    def store(self, title, content, last_update):
        try:
            doc = WikiDocument.objects.get(title=title)
            if last_update > doc.last_update:
                doc.content = content
                doc.last_update = last_update
                doc.update(upsert=True)
        except DoesNotExist:
            doc = WikiDocument(title=title, content=content, last_update=last_update)
            doc.save()

    def _extract(self, soup):
        if soup.head is None:
            return None, None, None
        title = soup.head.title.text
        if '-' in title:
            title = title.split('-')[0].strip()
        content = soup.find('div', attrs={'id': 'mw-content-text', 'class': 'mw-content-ltr'})
        while content.table is not None:
            content.table.extract()
        content = content.text
        last_update_str = soup.find('li', attrs={'id': 'footer-info-lastmod'}).text
        last_update = None
        match_en_time = self.en_time_reg.search(last_update_str)
        if match_en_time:
            last_update = match_en_time.group()
            last_update = parse(last_update)
        match_zh_time = self.zh_time_reg.search(last_update_str)
        if match_zh_time:
            last_update = match_zh_time.group()
            last_update = re.sub(r'\([^\)]+\)\s', '', last_update)
            last_update = last_update.replace(u'年', '-').replace(u'月', '-').replace(u'日', '')
            last_update = parse(last_update)
        if last_update is None:
            last_update = datetime.now()
        return title, content, last_update

    def parse(self, url=None):
        url = url or self.url
        lang = url.strip('http://').split('.', 1)[0]
        br = self.opener.browse_open(url)
        html = br.response().read()
        html = self.html_comment_reg.sub('', html)
        soup = BeautifulSoup(html)
        title, content, last_update = self._extract(soup)
        if title is None:
            return []
        title = title + ' ' + lang
        self.store(title, content, last_update)

        def _is_same(out_url):
            return out_url.rsplit('#', 1)[0] == url

        links = []
        for link in br.links():
            if link.url.startswith('http://'):
                out_url = link.url
                if not _is_same(out_url):
                    links.append(out_url)
            else:
                out_url = urlparse.urljoin(link.base_url, link.url)
                if not _is_same(out_url):
                    links.append(out_url)
        return links
class DoubanMovieParser(Parser):
    def __init__(self, opener=None, url=None, bundle=None, **kwargs):
        super(DoubanMovieParser, self).__init__(opener=opener, url=url, **kwargs)
        if self.opener is None:
            self.opener = MechanizeOpener()
        self.url = url
        self.opener.set_default_timeout(TIMEOUT)
        if not hasattr(self, 'logger') or self.logger is None:
            self.logger = get_logger(name='douban_parser')

    def get_subject_id(self, url):
        """extract subject id from url"""
        id_arr = re.findall(r'https://movie.douban.com/subject/(\d+)', url)
        if id_arr:
            return id_arr[0]

    def _check_url(self, dest_url, src_url):
        """check whether both urls point at the same path"""
        return dest_url.split('?')[0] == src_url.split('?')[0]

    def check(self, url, br):
        dest_url = br.geturl()
        if not self._check_url(dest_url, url):
            if dest_url.startswith('http://douban.com/login.php'):
                raise DoubanLoginFailure('Douban not login or login expired')
        return True

    def get_movie_subject(self, sid):
        try:
            movie = getattr(DoubanMovie, 'objects').get(sid=sid)
        except DoesNotExist:
            movie = DoubanMovie(sid=sid)
            movie.save()
        return movie

    def parse(self, url=None):
        url = url or self.url
        sid = self.get_subject_id(url)
        movie = self.get_movie_subject(sid)
        # if entry has updated in latest 24 hours, skip this url
        if movie.last_update and abs((datetime.utcnow() - movie.last_update).days) > 1:
            self.logger.warn('Skip visited url: %s' % url)
            return
        self.logger.debug('proxy:{}'.format(self.opener.proxies))
        try:
            br = self.opener.browse_open(url)
        except URLError:
            raise FetchBannedError()
        if not self.check(url, br):
            return
        html = br.response().read()
        if html is None:
            raise FetchBannedError()
        soup = beautiful_soup(html)

        if re.compile('<span class="pl">集数:</span>').findall(html):
            subtype = 't'
        else:
            subtype = 'm'
        try:
            title = soup.select("span[property='v:itemreviewed']")[0].text.strip()
        except:
            raise FetchBannedError()
        year_tags = soup.select("div#content > h1 span.year")
        if year_tags:
            year = year_tags[0].text[1:-1]
        else:
            year = None
        summary_tags = soup.select("span[property='v:summary']")
        summary = summary_tags[0].text.strip() if summary_tags else ''
        # tags
        tag_tags = soup.select('div .tags-body a')
        tags = [t.text for t in tag_tags]
        # get directors
        director_tags = soup.select('div #info > span a[rel="v:directedBy"]')
        p1 = re.compile(r'<[^>]+>(?P<director>[^<]+)</a>')
        directors = [p1.match(str(t)).group('director') for t in director_tags]
        # get stars
        star_tags = soup.select('div #info > span a[rel="v:starring"]')
        p2 = re.compile(r'<[^>]+>(?P<star>[^<]+)</a>')
        casts = [p2.match(str(t)).group('star') for t in star_tags]
        # get writers
        writers_tags = soup.select('div #info > span')[1].select('a')
        p2 = re.compile(r'<[^>]+>(?P<writer>[^<]+)</a>')
        writers = [p2.match(str(t)).group('writer') for t in writers_tags]
        # get genre
        genre_tags = soup.select('div #info > span[property="v:genre"]')
        p3 = re.compile(r'<span property="v:genre">(?P<genre>[^<]+)</span>')
        genres = [p3.match(str(t)).group('genre') for t in genre_tags]
        # get release date
        pubdate_tag = soup.select('div #info > span[property="v:initialReleaseDate"]')
        f4 = 0
        if pubdate_tag:
            p41 = re.compile(r'<[^>]+>(?P<pubdate>[^(]+)[(]中国大陆([ ]3D)*[)]<[^>]+>')
            p42 = re.compile(r'<[^>]+>(?P<pubdate>[^(]+)[(]中国内地([ ]3D)*[)]<[^>]+>')
            p43 = re.compile(r'<[^>]+>(?P<pubdate>[^(]+)[(]香港([ ]3D)*[)]<[^>]+>')
            p44 = re.compile(r'[0-9-]+')
            for t in pubdate_tag:
                m = p41.search(str(t))
                if m is not None:
                    f4 = 1
                    pubdate = m.group('pubdate')
                    break
                m = p42.search(str(t))
                if m is not None:
                    f4 = 1
                    pubdate = m.group('pubdate')
                    break
                m = p43.search(str(t))
                if m is not None:
                    f4 = 1
                    pubdate = m.group('pubdate')
                    break
                m = p44.search(str(t))
                if m is not None:
                    f4 = 1
                    pubdate = m.group()
                    break
        if f4 == 0:
            self.logger.critical('{0} has no pubdate'.format(sid))
            pubdate = year
        # append month/day if only the year (or year-month) is known
        if len(pubdate) == 4:
            pubdate = pubdate + "-6-30"
        elif len(pubdate) == 7:
            pubdate = pubdate + "-15"
        pubdate = datetime.strptime(pubdate, '%Y-%m-%d')
        if not year:
            year = pubdate.strftime('%Y')
        # get wishes
        wishes_tags = soup.select('div #subject-others-interests > .subject-others-interests-ft > a')
        if len(wishes_tags) == 0:
            self.logger.critical('{0} does not have wish count'.format(sid))
        wish_count = None
        collect_count = None
        for i in range(len(wishes_tags)):
            m = re.match(u'(?P<wishes>[0-9]+)人想看', wishes_tags[i].text)
            if m:
                wish_count = m.group('wishes')
                continue
            m = re.match(u'(?P<collections>[0-9]+)人看过', wishes_tags[i].text)
            if m:
                collect_count = m.group('collections')
        rating_num = soup.select(r'strong.rating_num')[0].text
        if not rating_num:
            rating_num = None
        rating_lvls = soup.select(r'div.ratings-on-weight span.rating_per')
        if rating_lvls:
            rating_lvls = [float(r.text[:-1]) for r in rating_lvls]
        # season
        season_tags = soup.select('div #info select#season')
        if season_tags:
            movie.seasons_count = len(season_tags[0].select('option'))
            movie.current_season = season_tags[0].select('option[selected]')[0].text
        photo_url = soup.select('a[class="nbgnbg"] img')[0].attrs['src']

        # region: save movie
        def parseNumber(v):
            m = re.findall(r'(\d+).*', v)
            if m:
                return int(m[0])
            else:
                # parse chinese numerals
                return convert(v.strip())

        info_map = {
            u'制片国家/地区': {'field': 'countries'},
            u'语言': {'field': 'languages'},
            u'集数': {'field': 'episodes_count', 'func': parseNumber},
            u'单集片长': {'field': 'duration', 'func': parseNumber},
            u'片长': {'field': 'duration', 'func': parseNumber},
            u'又名': {'field': 'aka', 'func': lambda v: v.split('/')},
            u'IMDb链接': {'field': 'imdb_id'},
        }
        info_str = soup.select('div #info')[0].text
        for k, f in info_map.items():
            v = re.findall(k + r'\:(.*)', info_str, re.MULTILINE)
            if v:
                func = (lambda s: s.strip()) if 'func' not in f else f['func']
                f_val = func(v[0].strip())
                setattr(movie, f['field'], f_val)

        movie.sid = sid
        movie.title = title
        movie.photo_alt = photo_url
        movie.year = year
        movie.summary = summary
        movie.tags = tags
        movie.subtype = subtype
        movie.directors = directors
        movie.casts = casts
        movie.writers = writers
        if rating_num:
            movie.rating = float(rating_num)
        if rating_lvls:
            movie.high_rating_pct = rating_lvls[0] + rating_lvls[1]
            movie.low_rating_pct = rating_lvls[3] + rating_lvls[4]
        if wish_count:
            movie.wish_count = wish_count
        if collect_count:
            movie.collect_count = collect_count
        movie.pubdate = pubdate
        movie.genres = genres
        movie.alt = url
        movie.last_update = datetime.now()
        movie.save()

        def _is_same(out_url, url):
            return out_url.rsplit('#', 1)[0] == url

        next_urls = soup.select("div.recommendations-bd a")
        for link in next_urls:
            out_url = link.attrs['href']
            if not _is_same(out_url, url) and out_url.startswith("https://movie.douban.com/subject"):
                sid_next = self.get_subject_id(out_url)
                if sid_next != sid:
                    yield out_url
class WikiParser(Parser):
    def __init__(self, opener=None, url=None, **kw):
        super(WikiParser, self).__init__(opener=opener, url=url, **kw)
        if self.opener is None:
            self.opener = MechanizeOpener()
        self.html_comment_reg = re.compile(r'<!--[^-]+-->', re.DOTALL)
        self.en_time_reg = re.compile(r'\d{1,2} [A-Z][a-z]{2,} \d{4} at \d{1,2}:\d{1,2}')
        self.zh_time_reg = re.compile(ur'\d{4}年\d{1,2}月\d{1,2}日 \(.+\) \d{1,2}:\d{1,2}')

    def store(self, title, content, last_update):
        try:
            doc = WikiDocument.objects.get(title=title)
            if last_update > doc.last_update:
                doc.content = content
                doc.last_update = last_update
                doc.update(upsert=True)
        except DoesNotExist:
            doc = WikiDocument(title=title, content=content, last_update=last_update)
            doc.save()

    def _extract(self, soup):
        if soup.head is None:
            return None, None, None
        title = soup.head.title.text
        if '-' in title:
            title = title.split('-')[0].strip()
        content = soup.find('div', attrs={'id': 'mw-content-text', 'class': 'mw-content-ltr'})
        while content.table is not None:
            content.table.extract()
        content = content.text
        last_update_str = soup.find('li', attrs={'id': 'footer-info-lastmod'}).text
        last_update = None
        match_en_time = self.en_time_reg.search(last_update_str)
        if match_en_time:
            last_update = match_en_time.group()
            last_update = parse(last_update)
        match_zh_time = self.zh_time_reg.search(last_update_str)
        if match_zh_time:
            last_update = match_zh_time.group()
            last_update = re.sub(r'\([^\)]+\)\s', '', last_update)
            last_update = last_update.replace(u'年', '-').replace(u'月', '-').replace(u'日', '')
            last_update = parse(last_update)
        if last_update is None:
            last_update = datetime.now()
        return title, content, last_update

    def parse(self, url=None):
        url = url or self.url
        lang = url.strip('http://').split('.', 1)[0]
        br = self.opener.browse_open(url)
        html = br.response().read()
        html = self.html_comment_reg.sub('', html)
        soup = BeautifulSoup(html)
        title, content, last_update = self._extract(soup)
        if not title:
            return
        title = title + ' ' + lang
        self.store(title, content, last_update)

        def _is_same(out_url, url):
            return out_url.rsplit('#', 1)[0] == url

        for link in br.links():
            q = urlparse.urlparse(link.url)
            if q.scheme in ['http', 'https']:
                out_url = link.url
                if not _is_same(out_url, url):
                    yield out_url
            elif not q.scheme:
                out_url = urlparse.urljoin(link.base_url, link.url)
                if not _is_same(out_url, url):
                    yield out_url
def setUp(self):
    self.base_url = 'http://zhidao.baidu.com'
    self.url = 'http://zhidao.baidu.com/question/559110619.html'
    self.html = MechanizeOpener().open(self.url)
import os
import re
import urllib2
import urlparse

from bs4 import BeautifulSoup

from cola.core.opener import MechanizeOpener

url = 'http://commons.wikimedia.org/wiki/File:Aerial_View_of_Trout_Lake.JPG'
#url = 'http://commons.wikimedia.org/wiki/File:Capturing_the_rain_water_falling_from_roof.jpg'
br = MechanizeOpener().browse_open(url)
html = br.response().read()
soup = BeautifulSoup(html)


def saveImg(picurl):
    """Download the picture at picurl and save it under local_path."""
    local_path = '/data/test/'
    picname = picurl.split('/')[-1]
    filename = local_path + picname
    print 'downing', picurl
    print filename
    try:
        response = urllib2.urlopen(picurl, timeout=10)
        cont = response.read()
        # cont = MechanizeOpener().browse_open(picurl).read()
    except urllib2.URLError as e:
        print e.reason
        return
    with open(filename, 'wb') as f:
        f.write(cont)


# the full-resolution image link on a Commons file page sits in the
# "fullImageLink" div
full_img = soup.find('div', attrs={'class': 'fullImageLink'})
if full_img is not None and full_img.a is not None:
    saveImg(urlparse.urljoin(url, full_img.a['href']))