def get(self):
    """Parse a product detail page: pull out the title, subtitle,
    description, smoothness index, information and tips blocks, queue
    every product picture URL for download, and record the full row
    on the class-level page list.
    """
    content = self.request.content
    # All text is automatically decoded to unicode; re-encode(xxx) yourself if needed.
    soup = BeautifulSoup(''.join(content))
    # The page's <h1> is the product title; bail out on pages without one.
    title = soup.html.body.h1
    if not title:
        return
    title = title.text
    subtitle = soup.findAll(attrs={'class':'f_cy f_s16b'})[0].string
    description = soup.find(attrs={'class':'f_cy f_s14 pt20'})
    description = description.text if description else ''
    # NOTE(review): assumes the first two 'pt20' blocks are the smoothness
    # index and the information section respectively — confirm against site.
    smooth_index = soup.findAll(attrs={'class':'pt20'})[0]
    smooth_index = smooth_index.text if smooth_index else ''
    information = soup.findAll(attrs={'class':'pt20'})[1]
    information = information.text if information else ''
    tips = soup.find(attrs={'class':'f_s14 pt20'})
    # NOTE(review): tips.nextSibling.nextSibling assumes a fixed sibling
    # layout; will raise AttributeError if the markup changes — verify.
    tips = tips.text + tips.nextSibling.nextSibling.text if tips else ''
    # pics = soup.findAll('a', href = re.compile(r'pic\d'))
    pics = soup.findAll(attrs={'class':'pic1'})
    if pics:
        imageList = []
        for pic in pics:
            img = pic.find('img')['src']
            imageList.append(img)
            # Queue each image URL so the picture handler downloads it.
            spider.put(HTTP%img)
        self.page.append((self.request.url, title, subtitle, description, smooth_index, information, tips, imageList))
def get(self): content = self.request.content soup = BeautifulSoup(''.join(content)) result = soup.find(attrs={'class':'centerPadd'}).find(attrs={'class':'clearfix goodsBox'}).findAll('div') for div in result: link = div.a['href'] print 'Fetch condom from ' + link spider.put(link)
def get(self): content = self.request.content soup = BeautifulSoup(''.join(content)) result = soup.find(attrs={'class':'products'}).findAll('li') for li in result: link = li.p.a['href'] print 'Fetch condom from ' + link spider.put(HTTP%link)
def get(self): content = self.request.content soup = BeautifulSoup("".join(content)) result = soup.findAll(attrs={"class": "title"}) for li in result: link = li["href"] link = link.replace("price", "detail") print "Fetch condom detail from: " + link spider.put(HTTP % link)
def get(self):
    """Parse the tag index: record each tag's display name in tag_names,
    then queue every per-tag listing page."""
    collected_ids = []
    for raw in self.extract_all('<a href="/tags/', '</a>'):
        # raw looks like: <id>.html">tag name
        tag_id = raw.split('.')[0]
        tag_names[int(tag_id)] = raw.split('>')[1]
        collected_ids.append(tag_id)
    for tid in collected_ids:
        spider.put(HTTP % ('tags/' + tid + '.html'))
def get(self): content = self.request.content soup = BeautifulSoup(''.join(content)) title = soup.html.head.title print title.text results = soup.find(attrs={'class':'searchListOne'}).findAll('li') for li in results: if li.div != None: print li.div.h3.a.text link = li.div.h3.a['href'] print link spider.put(link)
def get(self): for link in self.extract_all('<div><a title=', '</a>'): linke_name_res = self.re_link_and_name.match(link) if linke_name_res: volume_link = linke_name_res.group(1) volume_name = linke_name_res.group(2) print volume_link, volume_name VOLUMES_NAME_DICT[volume_link] = volume_name if not os.path.exists(volume_name): os.mkdir(volume_name) spider.put(volume_link) break else: pass
def get(self):
    """Register every hot tag (entries without a target attribute) in
    TAG_ARTICLE_DICT and queue its tag page."""
    for raw in self.extract_all('<li class="js-tag-w"><a href="', '</a>'):
        # Skip hot tags that carry a target label.
        if raw.find('target') != -1:
            continue
        pieces = raw.split('">')
        if len(pieces) != 2:
            continue
        href, name = pieces
        # Lazy... Just using url as key, tag id should be used.
        tag_url = HTTP % href
        TAG_ARTICLE_DICT[tag_url] = {'tag_name': name, 'article_list': []}
        spider.put(tag_url)
def get (self, category=None, _x=None): soup = BeautifulSoup (self.html) links = soup.find_all ('a', {'class':'readmore'}) for link_e in links: link = link_e.get ('href') print 'link', link spider.put (link) page_index = soup.find ('div', {'class': 'pageNavi'}) if page_index: for link_e in page_index.find_all ('a'): link = link_e.get ('href') if link: if link.find (site) == -1: link = "http://%s%s" % (site, link) if not self.indexes.has_key (link): print 'link_index', link self.indexes[link] = None spider.put (link)
def get(self): content = self.request.content soup = BeautifulSoup("".join(content)) result = soup.find(attrs={"class": "leftinnersubmenu sidebar"}).findAll("li") for li in result: link = li.a["href"] number = filter(lambda ch: ch in "0123456789", li.text) print "Condom number:" + number page_size = 20 page_num = 0 if int(number) % page_size != 0: page_num = int(number) / page_size + 1 else: page_num = int(number) / page_size for pn in range(page_num): page = pn + 1 url = link + "pn" + str(page) print "Fetch condom from: " + HTTP % url spider.put(HTTP % url)
def get(self):
    """Collect article titles for this tag page into tag_info, follow the
    'next page' link if present, otherwise print the collected titles."""
    url = str(self.url)
    r = re.compile(r'\d+')
    # The first number in the URL is the tag id.
    tag_id = int(r.findall(url)[0])
    # Article titles on this page.
    names = self.extract_all('<h3><a href="/article/', '</a></h3>')
    for n in names:
        title = n.split('>')[1]
        # setdefault replaces the `tag_id not in tag_info.keys()` branch
        # (keys() builds a list and scans it linearly in Python 2).
        tag_info.setdefault(tag_id, []).append(title)
    # Pagination: keep crawling while a next page exists; print at the end.
    pages = self.extract('<span class="next"><a href="http://www.huxiu.com/tags/', '.html">')
    if pages:
        link = "tags/%d/%d.html" % (int(pages.split('/')[0]), int(pages.split('/')[1]))
        spider.put(HTTP%link)
    else:
        self.show_title(tag_id)
def readHtml(): file = join(PREFIX, "yeedou.html") with open(file, "r") as f: html = f.read() f.close() soup = BeautifulSoup(html) result = soup.findAll(attrs={"class": "panel_tabalphabet"}) brand_list = [] for tab in result: li_list = tab.findAll("li") for li in li_list: link = li.a["href"] number = filter(lambda ch: ch in "0123456789", li.text) bracket_index = li.text.index("(") brand_name = li.text[0:bracket_index] brand_list.append(brand_name + " " + number) print brand_name + ":" + number with open(join(PREFIX, "brands.txt"), "w") as file: for brand in brand_list: file.write("%s\n" % brand) file.close() page_size = 20 page_num = 0 if int(number) % page_size != 0: page_num = int(number) / page_size + 1 else: page_num = int(number) / page_size for pn in range(page_num): page = pn + 1 url = link + "pn" + str(page) print "Fetch data from: " + HTTP % url spider.put(HTTP % url)
def get(self): now_id = int(self.get_argument("id", 0)) page = int(self.get_argument("pi", 0)) if now_id: for link in self.extract_all('<h3 class="nickname">','</h3>'): link = extract('"/','"', link) spider.put("http://xianguo.com/"+link) if page == 0: page_list = set(self.extract_all("href=\"/find/recommend?pi=","&")) for i in map(int,page_list): if page: spider.put("http://xianguo.com/find/recommend?id=%s&pi=%s"%(now_id,page)) else: for id in self.extract_all( 'href="/find/recommend?id=', '"' ): spider.put("http://xianguo.com/find/recommend?id=%s&pi=0"%id)
def get(self): now_id = int(self.get_argument("id", 0)) page = int(self.get_argument("pi", 0)) if now_id: for link in self.extract_all('<h3 class="nickname">', '</h3>'): link = extract('"/', '"', link) spider.put("http://xianguo.com/" + link) if page == 0: page_list = set( self.extract_all("href=\"/find/recommend?pi=", "&")) for i in map(int, page_list): if page: spider.put( "http://xianguo.com/find/recommend?id=%s&pi=%s" % (now_id, page)) else: for id in self.extract_all('href="/find/recommend?id=', '"'): spider.put("http://xianguo.com/find/recommend?id=%s&pi=0" % id)
def get(self):
    """Queue every thread URL found in the listing's <dt> entries."""
    for thread_href in self.extract_all('<dt class="xs2"><a href="', '"'):
        spider.put(HTTP % thread_href)
def get(self):
    """Queue the Chinese-English bilingual version of the article.

    Guards against extract() returning None when the marker is absent,
    which previously produced a bogus "HTTP % None" URL.
    """
    link = self.extract('class="pn" href="', '" target=""> 中英对照')
    if link:
        spider.put(HTTP % link)
html = html[:html.rfind('</div>')]
tid = int(self.get_argument('tid'))
print tid, name
self.page.append((tid, self.request.url, name, html))

@classmethod
def write(cls):
    """Sort the collected posts by thread id (newest first) and render
    them into an RSS-style XML file using the class template."""
    page = cls.page
    page.sort(key=itemgetter(0), reverse=True)
    with open(join(PREFIX, 'ecocn_org.xml'), 'w') as rss:
        rss.write(
            cls.template.render(
                rss_title='经济学人 . 中文网',
                rss_link='http://www.ecocn.org',
                li=[
                    dict(
                        link=link,
                        title=title,
                        txt=txt
                    )
                    for id, link, title, txt in cls.page
                ]
            )
        )

if __name__ == '__main__':
    spider.put('http://www.ecocn.org/portal.php?mod=list&catid=1')
    # Concurrent fetch threads, 30-second page-read timeout.
    # NOTE(review): the original comment said 10 threads, but run(3, ...)
    # uses 3 — confirm which is intended.
    spider.run(3, 30)
    forum.write()
def get(self):
    """Extract one forum thread: subject (with any leading "[...]" tag
    stripped), body HTML between the post container and the comment
    block, and store (tid, url, name, html) on the class-level page list."""
    name = self.extract('id="thread_subject">', '</a>')
    if not name:
        return
    # Drop a leading "[category]" prefix from the subject, if present.
    name = name.split(']', 1)[-1].strip()
    html = self.extract('<div class="t_fsz">', '<div id="comment_')
    # Trim the trailing </div> of the post container.
    html = html[:html.rfind('</div>')]
    tid = int(self.get_argument('tid'))
    print tid, name
    self.page.append((tid, self.request.url, name, html))

@classmethod
def write(cls):
    """Sort the collected posts by thread id (newest first) and render
    them into an RSS-style XML file using the class template."""
    page = cls.page
    page.sort(key=itemgetter(0), reverse=True)
    with open(join(PREFIX, 'ecocn_org.xml'), 'w') as rss:
        rss.write(
            cls.template.render(rss_title='经济学人 . 中文网',
                                rss_link='http://www.ecocn.org',
                                li=[
                                    dict(link=link, title=title, txt=txt)
                                    for id, link, title, txt in cls.page
                                ]))

if __name__ == '__main__':
    spider.put('http://www.ecocn.org/portal.php?mod=list&catid=1')
    # 10 concurrent fetch threads, 30-second page-read timeout.
    spider.run(10, 30)
def get(self):
    """Queue every thread URL found in the listing's <dt> entries."""
    for thread_href in self.extract_all('<dt class="xs2"><a href="', '"'):
        spider.put(HTTP % thread_href)
title = n.split('>')[1]
if tag_id not in tag_info.keys():
    tag_info[tag_id] = [title]
else:
    tag_info[tag_id].append(title)
# Pagination: if there is a next page keep crawling, otherwise print results.
pages = self.extract('<span class="next"><a href="http://www.huxiu.com/tags/', '.html">')
if pages:
    link = "tags/%d/%d.html" % (int(pages.split('/')[0]), int(pages.split('/')[1]))
    spider.put(HTTP%link)
else:
    self.show_title(tag_id)

def show_title(self, tag_id):
    """Print the tag's name followed by its numbered article titles,
    then wait for the user to press Enter."""
    if tag_id not in tag_info.keys():
        tag_info[tag_id] = []
    print tag_names[tag_id]
    i = 1
    for item in tag_info[tag_id]:
        print '\t%d' % i, item
        i += 1
    # Pause so the output can be read before the next tag is printed.
    raw_input()

if __name__ == '__main__':
    spider.put('http://www.huxiu.com/tagslist/all.html')
    # 10 concurrent fetch threads, 30-second page-read timeout.
    spider.run(10, 30)
# coding:utf-8 from spider.spider import route, Handler, spider import _env from os.path import abspath, dirname, join from operator import itemgetter PREFIX = join(dirname(abspath(__file__))) @route("/site/(\d+).html") class portal(Handler): def get(self, id): h2 = self.extract("<h2>", "</h2>") link = self.extract('<p>链接: <a href="', '"') h3 = self.extract("<h3>日志列表(", ")</h3>") if h2: print h3, link, h2 if __name__ == "__main__": URL = "http://www.udpwork.com/site/%s.html" for i in xrange(2, 240): spider.put(URL % i) # 10个并发抓取线程 , 网页读取超时时间为30秒 spider.run(10, 30)
array_files = output.split('|')
return array_files

def get_volume_name(self):
    """Look up the human-readable volume name recorded for the current URL."""
    volume_name = VOLUMES_NAME_DICT[self.request.url]
    return volume_name

def get(self):
    """Parse the embedded s_files/s_path script variables of a comic page
    and print a wget command for every picture file in the volume."""
    for segment in self.extract_all('<script>var s_files=\"', '</script>'):
        segment_list = segment.split('";')
        s_files = segment_list[0]
        # http://www.blgl8.com/script/ds/ds.js
        # Hard-coded image host mirroring the site's ds.js logic.
        s_ds = 'http://comic.1mh.in:2813/'
        s_path = segment_list[1].split('="')[1]
        array_files = self.get_array_files(s_files)
        volume_name = self.get_volume_name()
        for pic_file in array_files:
            # print volume_name + s_ds + s_path + pic_file
            wget_command = 'wget ' + s_ds + s_path + pic_file + \
                " -P " + volume_name
            print wget_command
            # status, output = commands.getstatusoutput(wget_command)
            # print status, output

if __name__ == '__main__':
    # spider.put('http://www.blgl8.com/comic-i/blgl44469/')
    spider.put('http://www.blgl8.com/comic-i/blgl39895/')
    spider.run(1, 5)
from spider.spider import spider, route, Handler from os.path import join, dirname, abspath import _env from BeautifulSoup import BeautifulSoup import pymongo PREFIX = join(dirname(abspath(__file__))) HTTP = 'http://aiailove.cn/%s' @route('/category-29-b0.html') class goodlist(Handler): def get(self): content = self.request.content soup = BeautifulSoup(''.join(content)) result = soup.find(attrs={'class':'centerPadd'}).find(attrs={'class':'clearfix goodsBox'}).findAll('div') for div in result: link = div.a['href'] print 'Fetch condom from ' + link spider.put(link) @route('/goods-\d+.html') class goodItem(Handler): def get(self): content = self.request.content soup = BeautifulSoup(''.join(content)) if __name__ == '__main__': spider.put('http://aiailove.cn/category-29-b0.html') spider.run(5,100)
def get(self):
    """Queue the Chinese-English bilingual version of the article.

    Guards against extract() returning None when the marker is absent,
    which previously produced a bogus "HTTP % None" URL.
    """
    link = self.extract(
        'class="pn" href="', '" target=""> 中英对照')
    if link:
        spider.put(HTTP % link)
#coding:utf-8 from spider.spider import route, Handler, spider, extract import _env from os.path import abspath, dirname, join from operator import itemgetter from html2txt import html2txt @route("/page/(\d+)/") class _(Handler): def get(self,page): for html in self.extract_all('<h2><a target="_blank" href="http://blog.jobbole.com', '<!-- .entry-content -->'): id = html[:html.find('"')] title = extract('/','<',html).split(">",1)[-1] link_html = extract('<div class="entry-content">','</p>', html) link_html = extract('<p', None, link_html) txt = html2txt(link_html) if "http://" in txt: print "http://blog.jobbole.com%s"%id print title print txt print "" if __name__ == '__main__': for i in xrange(1,159): spider.put('http://blog.jobbole.com/page/%s/'%i) #10个并发抓取线程 , 网页读取超时时间为30秒 spider.run(10, 30)
HTTP = 'http://www.tianya.cn%s' @route('/search/bbs') class search_page(Handler): def get(self): content = self.request.content soup = BeautifulSoup(''.join(content)) title = soup.html.head.title print title.text results = soup.find(attrs={'class':'searchListOne'}).findAll('li') for li in results: if li.div != None: print li.div.h3.a.text link = li.div.h3.a['href'] print link spider.put(link) @route('/post-\w+-\w+-\w\.shtml') class article(Handler): def get(self): content = self.request.content soup = BeautifulSoup(''.join(content)) content = soup.find(attrs = {'class':'bbs-content clearfix'}).text print content if __name__ == '__main__': spider.put('http://www.tianya.cn/search/bbs?q=安全套&pn=1') spider.run(5,100)
@route('/tags/\d+\.html') class Articles(Handler): def get(self): for link in self.extract_all('<h3><a href="', '</a>'): article_result = link.split('">') if len(article_result) != 2: continue article_name = article_result[1] tag_url = self.get_tag_url() # tag_url must be in TAG_ARTICLE_DICT TAG_ARTICLE_DICT[tag_url]['article_list'].append(article_name) def get_tag_url(self): url = self.request.url return url def print_result(): for value in TAG_ARTICLE_DICT.values(): print value['tag_name'] for idx, article_name in enumerate(value['article_list']): print ' ', idx + 1, '.', article_name if __name__ == '__main__': spider.put('http://www.huxiu.com/tagslist/all.html') # 10个并发抓取线程 , 网页读取超时时间为30秒 spider.run(10, 30) print_result()
if path == '/zaker/apps.php':
    # App index response: print each app entry.
    data = data['data']['datas']
    for i in data:
        print_i(i)
else:
    # Other endpoints must report msg == 'ok' to be processed.
    if data['msg'] != 'ok': return
    data = data['data']
    if 'articles' in data:
        for txt in data['articles']:
            if 'full_url' in txt:
                url = txt['full_url']
                #spider.put( url)
                #print url
    else:
        # Single-article payload: print its content as plain text.
        print html2txt(data['content'])

#http://iphone.myzaker.com/l.php?l=50b7f4a1497959972f00007e
#a.myzaker.com/aa/201210/677/508f73234979593b55000023.htm?
#@route('/(.*)\.htm')
#class _(Handler):
#    def get(self, path):
#        print path

if __name__ == '__main__':
    spider.put('http://iphone.myzaker.com/zaker/apps.php?act=getAllAppsData')
    # 10 concurrent fetch threads, 30-second page-read timeout.
    spider.run(10, 30)
@route('/find/recommend')
class _(Handler):
    def get(self):
        """Crawl xianguo recommend pages.

        With an id: queue each user profile link on the page; on the
        first page (pi == 0) also queue the remaining pagination pages.
        Without an id: queue every recommend category found.
        """
        now_id = int(self.get_argument("id", 0))
        page = int(self.get_argument("pi", 0))
        if now_id:
            for link in self.extract_all('<h3 class="nickname">', '</h3>'):
                link = extract('"/', '"', link)
                spider.put("http://xianguo.com/" + link)
            if page == 0:
                page_list = set(
                    self.extract_all("href=\"/find/recommend?pi=", "&"))
                # BUG FIX: the loop previously tested and used `page`,
                # which is always 0 in this branch, so no pagination page
                # was ever queued; queue each non-zero page index instead.
                for i in map(int, page_list):
                    if i:
                        spider.put(
                            "http://xianguo.com/find/recommend?id=%s&pi=%s" %
                            (now_id, i))
        else:
            for id in self.extract_all('href="/find/recommend?id=', '"'):
                spider.put("http://xianguo.com/find/recommend?id=%s&pi=0" % id)


if __name__ == '__main__':
    URL = 'http://xianguo.com/find/recommend'
    spider.put(URL)
    # 10 concurrent fetch threads, 30-second page-read timeout.
    spider.run(10, 30)
import _env from os.path import abspath, dirname, join from operator import itemgetter from html2txt import html2txt @route("/page/(\d+)/") class _(Handler): def get(self, page): for html in self.extract_all( '<h2><a target="_blank" href="http://blog.jobbole.com', '<!-- .entry-content -->'): id = html[:html.find('"')] title = extract('/', '<', html).split(">", 1)[-1] link_html = extract('<div class="entry-content">', '</p>', html) link_html = extract('<p', None, link_html) txt = html2txt(link_html) if "http://" in txt: print "http://blog.jobbole.com%s" % id print title print txt print "" if __name__ == '__main__': for i in xrange(1, 159): spider.put('http://blog.jobbole.com/page/%s/' % i) #10个并发抓取线程 , 网页读取超时时间为30秒 spider.run(10, 30)
doc_id = collection.insert(item) # add a document to the index solrConnection.add(id = doc_id, title = item.get('title'), description = item.get('description'), subtitle = item.get('subtitle'), information = item.get('information')) # commit to solr solrConnection.commit() @route('/images/.+') class pic(Handler): def get(self): save_pic(self.html, route.path.split('/')[-1]) @route('/wp-content/uploads/.+') class pic2(Handler): def get(self): save_pic(self.html, route.path.split('/')[-1]) def save_pic(content, fname): basepath = join(PREFIX, 'images') fpath = join(basepath, fname) f = open(fpath, 'wb') f.write(content) f.close() print 'Download image: ' + fname if __name__ == '__main__': spider.put('http://www.durex.com.cn/products') spider.run(5,100) item.writedb()
@route('/find/recommend')
class _(Handler):
    def get(self):
        """Crawl xianguo recommend pages.

        With an id: queue each user profile link on the page; on the
        first page (pi == 0) also queue the remaining pagination pages.
        Without an id: queue every recommend category found.
        """
        now_id = int(self.get_argument("id", 0))
        page = int(self.get_argument("pi", 0))
        if now_id:
            for link in self.extract_all('<h3 class="nickname">','</h3>'):
                link = extract('"/','"', link)
                spider.put("http://xianguo.com/"+link)
            if page == 0:
                page_list = set(self.extract_all("href=\"/find/recommend?pi=","&"))
                # BUG FIX: the loop previously tested and used `page`,
                # which is always 0 in this branch, so no pagination page
                # was ever queued; queue each non-zero page index instead.
                for i in map(int,page_list):
                    if i:
                        spider.put("http://xianguo.com/find/recommend?id=%s&pi=%s"%(now_id,i))
        else:
            for id in self.extract_all( 'href="/find/recommend?id=', '"' ):
                spider.put("http://xianguo.com/find/recommend?id=%s&pi=0"%id)


if __name__ == '__main__':
    URL = 'http://xianguo.com/find/recommend'
    spider.put(URL)
    # 10 concurrent fetch threads, 30-second page-read timeout.
    spider.run(10, 30)
if link.find (site) == -1:
    link = "http://%s%s" % (site, link)
if not self.indexes.has_key (link):
    print 'link_index', link
    self.indexes[link] = None
    spider.put (link)

# Front-matter template for each exported post file.
# NOTE(review): newline layout reconstructed from a collapsed source —
# confirm the intended YAML front-matter formatting.
template = unicode("""---
title: %s
date: %s
categories: %s
---
""", 'utf-8')

if __name__ == '__main__':
    spider.put ("http://%s/index_1.html" % (site))
    # spider.put ("http://frostyplanet.blogbus.com/c1566502/")
    spider.run (5, 5)
    if not os.path.exists (os.path.join (PREFIX, "output")):
        os.mkdir ("output")
    # Export every collected post as "<category>_<id>" with front matter.
    for _id, title, times, category, html in blog_post.posts:
        file_path = os.path.join (os.path.join (PREFIX, "output", str(category) + "_" + str(_id)))
        content = template % (title, times, category) + unicode(html, 'utf-8')
        f = open (file_path, "w")
        try:
            f.write (content.encode ('utf-8'))
        finally:
            f.close ()
        print "wrote", file_path
#coding:utf-8 from spider.spider import route, Handler, spider import _env from os.path import abspath, dirname, join from operator import itemgetter PREFIX = join(dirname(abspath(__file__))) @route('/snap/(\d+)') class portal(Handler): def get(self, id): link = self.extract('源地址:<a href="', '"') #name = self.extract('作者:',' | ') title = self.extract('<h1 style="margin-bottom:0;">', '</h1>') if link: print id, link, title.replace("\n", " ") if __name__ == '__main__': URL = "http://ucdchina.com/snap/%s" for i in xrange(1250, 12460 + 1): spider.put(URL % i) #10个并发抓取线程 , 网页读取超时时间为30秒 spider.run(10, 30)