Example #1
    def get(self):
        content = self.request.content
        soup = BeautifulSoup(''.join(content))

        # All text has already been auto-converted to unicode; call encode(...) yourself if needed
        title = soup.html.body.h1
        if not title:
            return
        title = title.text
        subtitle = soup.find(attrs={'class': 'f_cy f_s16b'})
        subtitle = subtitle.string if subtitle else ''
        description = soup.find(attrs={'class': 'f_cy f_s14 pt20'})
        description = description.text if description else ''
        # findAll may return fewer matches than expected, so guard the indexing
        pt20_blocks = soup.findAll(attrs={'class': 'pt20'})
        smooth_index = pt20_blocks[0].text if len(pt20_blocks) > 0 else ''
        information = pt20_blocks[1].text if len(pt20_blocks) > 1 else ''
        tips = soup.find(attrs={'class': 'f_s14 pt20'})
        # the second tips chunk sits two siblings over; guard the chain
        more = tips.nextSibling.nextSibling if tips and tips.nextSibling else None
        tips = (tips.text + (more.text if more else '')) if tips else ''

#        pics = soup.findAll('a', href = re.compile(r'pic\d'))
        pics = soup.findAll(attrs={'class': 'pic1'})
        imageList = []  # defined before the loop so the append below is always bound
        for pic in pics:
            img = pic.find('img')['src']
            imageList.append(img)
            spider.put(HTTP % img)

        self.page.append((self.request.url, title, subtitle, description, smooth_index, information, tips, imageList))
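
Note: the handler fragments throughout this listing rely on module-level scaffolding that only appears in the fuller examples (see Example #23): a spider work queue, a route decorator, a Handler base class, and an HTTP URL template. A minimal sketch of that assumed context, with a placeholder hostname:

# Sketch of the assumed scaffolding; the hostname is a placeholder --
# each real example defines its own HTTP template.
from spider.spider import route, Handler, spider
from BeautifulSoup import BeautifulSoup

HTTP = 'http://example.com/%s'  # filled in as HTTP % path

@route('/list-\d+\.html')
class listing(Handler):
    def get(self):
        soup = BeautifulSoup(''.join(self.request.content))
        for a in soup.findAll('a', href=True):
            spider.put(HTTP % a['href'])

if __name__ == '__main__':
    spider.put(HTTP % 'list-1.html')
    spider.run(10, 30)  # 10 worker threads, 30-second page timeout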
Example #2
 def get(self):
     content = self.request.content
     soup = BeautifulSoup(''.join(content))
     result = soup.find(attrs={'class':'centerPadd'}).find(attrs={'class':'clearfix goodsBox'}).findAll('div')
     for div in result:
         link = div.a['href']
         print 'Fetch condom from ' + link
         spider.put(link)
Example #3
 def get(self):
     content = self.request.content
     soup = BeautifulSoup(''.join(content))
     result = soup.find(attrs={'class':'products'}).findAll('li')
     for li in result:
         link = li.p.a['href']
         print 'Fetch condom from ' + link
         spider.put(HTTP % link)
Example #4
 def get(self):
     content = self.request.content
     soup = BeautifulSoup("".join(content))
     result = soup.findAll(attrs={"class": "title"})
     for li in result:
         link = li["href"]
         link = link.replace("price", "detail")
         print "Fetch condom detail from: " + link
         spider.put(HTTP % link)
Example #5
 def get(self):
     id_list = []
     for link in self.extract_all('<a href="/tags/', '</a>'):
         tag_id = link.split('.')[0]
         tag_name = link.split('>')[1]
         tag_names[int(tag_id)] = tag_name
         id_list.append(tag_id)
     for t_id in id_list:
         newlink = 'tags/' + t_id + '.html'
         spider.put(HTTP % newlink)
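
extract and extract_all come from the spider library; judging by their use across these examples, they slice out the text between a prefix marker and a suffix marker (the Handler methods apply the same slicing to the fetched page, and a suffix of None appears to mean the rest of the string). A rough pure-string equivalent, assuming that behavior:

def extract(prefix, suffix, html):
    # Text between the first prefix and the following suffix;
    # None when either marker is missing.
    start = html.find(prefix)
    if start == -1:
        return None
    start += len(prefix)
    if suffix is None:
        return html[start:]
    end = html.find(suffix, start)
    if end == -1:
        return None
    return html[start:end]

def extract_all(prefix, suffix, html):
    # Every non-overlapping prefix...suffix slice, in document order.
    result, pos = [], 0
    while True:
        start = html.find(prefix, pos)
        if start == -1:
            return result
        start += len(prefix)
        end = html.find(suffix, start)
        if end == -1:
            return result
        result.append(html[start:end])
        pos = end + len(suffix)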
Example #6
 def get(self):
     content = self.request.content
     soup = BeautifulSoup(''.join(content))
     title = soup.html.head.title
     print title.text
     results = soup.find(attrs={'class':'searchListOne'}).findAll('li')
     for li in results:
         if li.div is not None:
             print li.div.h3.a.text
             link = li.div.h3.a['href']
             print link
             spider.put(link)
Example #7
 def get(self):
     for link in self.extract_all('<div><a title=', '</a>'):
         link_name_res = self.re_link_and_name.match(link)
         if link_name_res:
             volume_link = link_name_res.group(1)
             volume_name = link_name_res.group(2)
             print volume_link, volume_name
             VOLUMES_NAME_DICT[volume_link] = volume_name
             if not os.path.exists(volume_name):
                 os.mkdir(volume_name)
             spider.put(volume_link)
             break
Example #8
    def get(self):
        for link in self.extract_all('<li class="js-tag-w"><a href="', '</a>'):
            # filtering hot tags which have target label
            if link.find('target') == -1:
                sp_result = link.split('">')

                if len(sp_result) != 2:
                    continue
                link_href = sp_result[0]
                link_name = sp_result[1]

                # Lazy... Just using url as key, tag id should be used.
                tag_url = HTTP % link_href
                TAG_ARTICLE_DICT[tag_url] = {
                    'tag_name': link_name,
                    'article_list': []
                }
                spider.put(tag_url)
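
After a crawl, TAG_ARTICLE_DICT maps each tag URL to its name and the article titles collected by the companion handler in Example #27. The shape, with illustrative values (real keys come from HTTP % link_href):

TAG_ARTICLE_DICT = {
    'http://www.huxiu.com/tags/1.html': {
        'tag_name': 'some tag',
        'article_list': ['first article title', 'second article title'],
    },
}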
Example #9
 def get (self, category=None, _x=None):
     soup = BeautifulSoup (self.html)
     links = soup.find_all ('a', {'class':'readmore'})
     for link_e in links:
         link = link_e.get ('href')
         print 'link', link
         spider.put (link)
     page_index = soup.find ('div', {'class': 'pageNavi'})
     if page_index:
         for link_e in page_index.find_all ('a'):
             link = link_e.get ('href')
             if link:
                 if link.find (site) == -1:
                     link = "http://%s%s" % (site, link)
                 if link not in self.indexes:
                     print 'link_index', link
                     self.indexes[link] = None
                     spider.put (link)
Example #10
 def get(self):
     content = self.request.content
     soup = BeautifulSoup("".join(content))
     result = soup.find(attrs={"class": "leftinnersubmenu sidebar"}).findAll("li")
     for li in result:
         link = li.a["href"]
         number = filter(lambda ch: ch in "0123456789", li.text)
         print "Condom number:" + number
         page_size = 20
         page_num = 0
         if int(number) % page_size != 0:
             page_num = int(number) / page_size + 1
         else:
             page_num = int(number) / page_size
         for pn in range(page_num):
             page = pn + 1
             url = link + "pn" + str(page)
             print "Fetch condom from: " + HTTP % url
             spider.put(HTTP % url)
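
The if/else above is ceiling division; with Python 2's integer division it collapses to one line:

# round up to a whole number of pages
page_num = (int(number) + page_size - 1) / page_size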
Example #11
    def get(self):
        url = str(self.url)
        r = re.compile(r'\d+')
        tag_id = int(r.findall(url)[0])

        # Extract the article titles
        names = self.extract_all('<h3><a href="/article/', '</a></h3>')
        for n in names:
            title = n.split('>')[1]
            if tag_id not in tag_info:
                tag_info[tag_id] = [title]
            else:
                tag_info[tag_id].append(title)

        # Pagination: if there is a next page, keep crawling; otherwise print the results
        pages = self.extract('<span class="next"><a href="http://www.huxiu.com/tags/', '.html">')
        if pages:
            link = "tags/%d/%d.html" % (int(pages.split('/')[0]), int(pages.split('/')[1]))
            spider.put(HTTP % link)
        else:
            self.show_title(tag_id)
Example #12
def readHtml():
    fpath = join(PREFIX, "yeedou.html")
    with open(fpath, "r") as f:
        html = f.read()

    soup = BeautifulSoup(html)
    result = soup.findAll(attrs={"class": "panel_tabalphabet"})
    brand_list = []
    for tab in result:
        li_list = tab.findAll("li")
        for li in li_list:
            link = li.a["href"]

            number = filter(lambda ch: ch in "0123456789", li.text)
            bracket_index = li.text.index("(")
            brand_name = li.text[0:bracket_index]
            brand_list.append(brand_name + " " + number)
            print brand_name + ":" + number

            with open(join(PREFIX, "brands.txt"), "w") as file:
                for brand in brand_list:
                    file.write("%s\n" % brand)
            file.close()

            page_size = 20
            page_num = 0
            if int(number) % page_size != 0:
                page_num = int(number) / page_size + 1
            else:
                page_num = int(number) / page_size
            for pn in range(page_num):
                page = pn + 1
                url = link + "pn" + str(page)
                print "Fetch data from: " + HTTP % url
                spider.put(HTTP % url)
Example #13
 def get(self):
     now_id = int(self.get_argument("id", 0))
     page = int(self.get_argument("pi", 0))
     if now_id:
         for link in self.extract_all('<h3 class="nickname">','</h3>'):
             link = extract('"/','"', link)
             spider.put("http://xianguo.com/"+link)
         if page == 0:
             page_list = set(self.extract_all("href=\"/find/recommend?pi=","&"))
             for i in map(int, page_list):
                 if i:  # page 0 is the page we are already on
                     spider.put("http://xianguo.com/find/recommend?id=%s&pi=%s" % (now_id, i))
     else:
         for id in self.extract_all(
             'href="/find/recommend?id=', '"'
         ):
             spider.put("http://xianguo.com/find/recommend?id=%s&pi=0"%id)
Example #15
 def get(self):
     for link in self.extract_all('<dt class="xs2"><a href="', '"'):
         spider.put(HTTP % link)
Example #16
 def get(self):
     link = self.extract('class="pn" href="', '" target=""> 中英对照')
     spider.put(HTTP % link)
Example #17
        html = html[:html.rfind('</div>')]
        tid = int(self.get_argument('tid'))
        print tid, name
        self.page.append((tid, self.request.url, name, html))

    @classmethod
    def write(cls):
        page = cls.page
        page.sort(key=itemgetter(0), reverse=True)
        with open(join(PREFIX, 'ecocn_org.xml'), 'w') as rss:
            rss.write(
                cls.template.render(
                    rss_title='经济学人 . 中文网',
                    rss_link='http://www.ecocn.org',
                    li=[
                        dict(
                            link=link,
                            title=title,
                            txt=txt
                        ) for id, link, title, txt in cls.page
                    ]
                )
            )

if __name__ == '__main__':
    spider.put('http://www.ecocn.org/portal.php?mod=list&catid=1')
    # 3 concurrent fetch threads; 30-second page-read timeout
    spider.run(3, 30)
    forum.write()

Example #18
    def get(self):
        name = self.extract('id="thread_subject">', '</a>')
        if not name:
            return
        name = name.split(']', 1)[-1].strip()
        html = self.extract('<div class="t_fsz">', '<div id="comment_')
        html = html[:html.rfind('</div>')]
        tid = int(self.get_argument('tid'))
        print tid, name
        self.page.append((tid, self.request.url, name, html))

    @classmethod
    def write(cls):
        page = cls.page
        page.sort(key=itemgetter(0), reverse=True)
        with open(join(PREFIX, 'ecocn_org.xml'), 'w') as rss:
            rss.write(
                cls.template.render(rss_title='经济学人 . 中文网',
                                    rss_link='http://www.ecocn.org',
                                    li=[
                                        dict(link=link, title=title, txt=txt)
                                        for id, link, title, txt in cls.page
                                    ]))


if __name__ == '__main__':
    spider.put('http://www.ecocn.org/portal.php?mod=list&catid=1')
    # 10 concurrent fetch threads; 30-second page-read timeout
    spider.run(10, 30)
Example #20
            title = n.split('>')[1]
            if tag_id not in tag_info:
                tag_info[tag_id] = [title]
            else:
                tag_info[tag_id].append(title)

        # Pagination: if there is a next page, keep crawling; otherwise print the results
        pages = self.extract('<span class="next"><a href="http://www.huxiu.com/tags/', '.html">')
        if pages:
            link = "tags/%d/%d.html" % (int(pages.split('/')[0]), int(pages.split('/')[1]))
            spider.put(HTTP % link)
        else:
            self.show_title(tag_id)

    def show_title(self, tag_id):
        if tag_id not in tag_info:
            tag_info[tag_id] = []
        print tag_names[tag_id]
        i = 1
        for item in tag_info[tag_id]:
            print '\t%d' % i, item
            i += 1
        raw_input()


if __name__ == '__main__':
    spider.put('http://www.huxiu.com/tagslist/all.html')
    # 10 concurrent fetch threads; 30-second page-read timeout
    spider.run(10, 30)

Example #21
# coding:utf-8
from spider.spider import route, Handler, spider
import _env
from os.path import abspath, dirname, join
from operator import itemgetter

PREFIX = join(dirname(abspath(__file__)))


@route("/site/(\d+).html")
class portal(Handler):
    def get(self, id):
        h2 = self.extract("<h2>", "</h2>")
        link = self.extract('<p>链接: <a href="', '"')
        h3 = self.extract("<h3>日志列表(", ")</h3>")

        if h2:
            print h3, link, h2


if __name__ == "__main__":

    URL = "http://www.udpwork.com/site/%s.html"
    for i in xrange(2, 240):
        spider.put(URL % i)

    # 10 concurrent fetch threads; 30-second page-read timeout
    spider.run(10, 30)
Example #22
            array_files = output.split('|')
            return array_files

    def get_volume_name(self):
        volume_name = VOLUMES_NAME_DICT[self.request.url]
        return volume_name

    def get(self):
        for segment in self.extract_all('<script>var s_files=\"', '</script>'):
            segment_list = segment.split('";')
            s_files = segment_list[0]
            # http://www.blgl8.com/script/ds/ds.js
            s_ds = 'http://comic.1mh.in:2813/'
            s_path = segment_list[1].split('="')[1]
            array_files = self.get_array_files(s_files)
            volume_name = self.get_volume_name()

            for pic_file in array_files:
                # print volume_name + s_ds + s_path + pic_file
                wget_command = 'wget ' + s_ds + s_path + pic_file + \
                               " -P " + volume_name
                print wget_command
                # status, output = commands.getstatusoutput(wget_command)
                # print status, output


if __name__ == '__main__':
    # spider.put('http://www.blgl8.com/comic-i/blgl44469/')
    spider.put('http://www.blgl8.com/comic-i/blgl39895/')
    spider.run(1, 5)
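
Building the wget command by string concatenation breaks as soon as a path contains a space or shell metacharacter; a list-based subprocess call sidesteps shell quoting entirely (a sketch, assuming wget is on the PATH):

import subprocess

# Same download as wget_command above, minus the quoting pitfalls.
subprocess.call(['wget', s_ds + s_path + pic_file, '-P', volume_name])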
Example #23
from spider.spider import spider, route, Handler
from os.path import join, dirname, abspath
import _env
from BeautifulSoup import BeautifulSoup
import pymongo

PREFIX = join(dirname(abspath(__file__)))
HTTP = 'http://aiailove.cn/%s'

@route('/category-29-b0.html')
class goodlist(Handler):
    def get(self):
        content = self.request.content
        soup = BeautifulSoup(''.join(content))
        result = soup.find(attrs={'class':'centerPadd'}).find(attrs={'class':'clearfix goodsBox'}).findAll('div')
        for div in result:
            link = div.a['href']
            print 'Fetch condom from ' + link
            spider.put(link)

@route('/goods-\d+.html')
class goodItem(Handler):
    def get(self):
        content = self.request.content
        soup = BeautifulSoup(''.join(content))

if __name__ == '__main__':
    spider.put('http://aiailove.cn/category-29-b0.html')
    spider.run(5, 100)

Example #25
#coding:utf-8
from spider.spider import route, Handler, spider, extract
import _env
from os.path import abspath, dirname, join
from operator import itemgetter
from html2txt import html2txt

@route("/page/(\d+)/")
class _(Handler):
    def get(self, page):
        for html in self.extract_all('<h2><a  target="_blank" href="http://blog.jobbole.com', '<!-- .entry-content -->'):
            id = html[:html.find('"')]
            title = extract('/', '<', html).split(">", 1)[-1]
            link_html = extract('<div class="entry-content">', '</p>', html)
            link_html = extract('<p', None, link_html)
            txt = html2txt(link_html)
            if "http://" in txt:
                print "http://blog.jobbole.com%s" % id
                print title
                print txt
                print ""

if __name__ == '__main__':
    for i in xrange(1, 159):
        spider.put('http://blog.jobbole.com/page/%s/' % i)

    # 10 concurrent fetch threads; 30-second page-read timeout
    spider.run(10, 30)

Example #26
HTTP = 'http://www.tianya.cn%s'

@route('/search/bbs')
class search_page(Handler):
    def get(self):
        content = self.request.content
        soup = BeautifulSoup(''.join(content))
        title = soup.html.head.title
        print title.text
        results = soup.find(attrs={'class':'searchListOne'}).findAll('li')
        for li in results:
            if li.div is not None:
                print li.div.h3.a.text
                link = li.div.h3.a['href']
                print link
                spider.put(link)


@route('/post-\w+-\w+-\w\.shtml')
class article(Handler):
    def get(self):
        content = self.request.content
        soup = BeautifulSoup(''.join(content))
        content = soup.find(attrs={'class': 'bbs-content clearfix'}).text
        print content

if __name__ == '__main__':
    spider.put('http://www.tianya.cn/search/bbs?q=安全套&pn=1')
    spider.run(5, 100)

Example #27
@route('/tags/\d+\.html')
class Articles(Handler):
    def get(self):
        for link in self.extract_all('<h3><a href="', '</a>'):
            article_result = link.split('">')

            if len(article_result) != 2:
                continue
            article_name = article_result[1]
            tag_url = self.get_tag_url()
            # tag_url must be in TAG_ARTICLE_DICT
            TAG_ARTICLE_DICT[tag_url]['article_list'].append(article_name)

    def get_tag_url(self):
        url = self.request.url
        return url


def print_result():
    for value in TAG_ARTICLE_DICT.values():
        print value['tag_name']
        for idx, article_name in enumerate(value['article_list']):
            print ' ', idx + 1, '.', article_name


if __name__ == '__main__':
    spider.put('http://www.huxiu.com/tagslist/all.html')
    # 10 concurrent fetch threads; 30-second page-read timeout
    spider.run(10, 30)
    print_result()
Example #28
        if path == '/zaker/apps.php':
            data = data['data']['datas']
            for i in data:
                print_i(i)
        else:
            if data['msg'] != 'ok': return
            data = data['data']
            if 'articles' in data:
                for txt in data['articles']:
                    if 'full_url' in txt:
                        url = txt['full_url']
                        #spider.put( url)
                        #print url 
            else:
                print html2txt(data['content'])


#http://iphone.myzaker.com/l.php?l=50b7f4a1497959972f00007e
#a.myzaker.com/aa/201210/677/508f73234979593b55000023.htm?
#@route('/(.*)\.htm')
#class _(Handler):
#    def get(self, path):
#        print path 


if __name__ == '__main__':
    spider.put('http://iphone.myzaker.com/zaker/apps.php?act=getAllAppsData')
    # 10 concurrent fetch threads; 30-second page-read timeout
    spider.run(10, 30)

Example #29
@route('/find/recommend')
class _(Handler):
    def get(self):
        now_id = int(self.get_argument("id", 0))
        page = int(self.get_argument("pi", 0))
        if now_id:
            for link in self.extract_all('<h3 class="nickname">', '</h3>'):
                link = extract('"/', '"', link)
                spider.put("http://xianguo.com/" + link)
            if page == 0:
                page_list = set(
                    self.extract_all("href=\"/find/recommend?pi=", "&"))
                for i in map(int, page_list):
                    if i:  # page 0 is the page we are already on
                        spider.put(
                            "http://xianguo.com/find/recommend?id=%s&pi=%s" %
                            (now_id, i))
        else:
            for id in self.extract_all('href="/find/recommend?id=', '"'):
                spider.put("http://xianguo.com/find/recommend?id=%s&pi=0" % id)


if __name__ == '__main__':

    URL = 'http://xianguo.com/find/recommend'
    spider.put(URL)

    # 10 concurrent fetch threads; 30-second page-read timeout
    spider.run(10, 30)
Example #31
            doc_id = collection.insert(item)
            # add a document to the index
            solrConnection.add(id=doc_id, title=item.get('title'), description=item.get('description'), subtitle=item.get('subtitle'), information=item.get('information'))
        # commit to solr
        solrConnection.commit()

@route('/images/.+')
class pic(Handler):
    def get(self):
        save_pic(self.html, route.path.split('/')[-1])

@route('/wp-content/uploads/.+')
class pic2(Handler):
    def get(self):
        save_pic(self.html, route.path.split('/')[-1])


def save_pic(content, fname):
    basepath = join(PREFIX, 'images')
    fpath = join(basepath, fname)
    with open(fpath, 'wb') as f:
        f.write(content)
    print 'Download image: ' + fname

if __name__ == '__main__':
    spider.put('http://www.durex.com.cn/products')
    spider.run(5, 100)
    item.writedb()

Example #33
                    if link.find (site) == -1:
                        link = "http://%s%s" % (site, link)
                    if link not in self.indexes:
                        print 'link_index', link
                        self.indexes[link] = None
                        spider.put (link)

template = unicode("""---
title: %s
date: %s
categories: %s
---
""", 'utf-8')

if __name__ == '__main__':
    spider.put ("http://%s/index_1.html" % (site))
#    spider.put ("http://frostyplanet.blogbus.com/c1566502/")
    spider.run (5, 5)
        
    if not os.path.exists (os.path.join (PREFIX, "output")):
        os.mkdir (os.path.join (PREFIX, "output"))
    for _id, title, times, category, html in blog_post.posts:
        file_path = os.path.join (PREFIX, "output", str(category) + "_" + str(_id))
        content = template % (title, times, category) + unicode(html, 'utf-8')
        with open (file_path, "w") as f:
            f.write (content.encode ('utf-8'))
        print "wrote", file_path
Example #34
#coding:utf-8
from spider.spider import route, Handler, spider
import _env
from os.path import abspath, dirname, join
from operator import itemgetter

PREFIX = join(dirname(abspath(__file__)))


@route('/snap/(\d+)')
class portal(Handler):
    def get(self, id):
        link = self.extract('源地址:<a href="', '"')
        #name = self.extract('作者:','&nbsp;&nbsp;|&nbsp;&nbsp;')
        title = self.extract('<h1 style="margin-bottom:0;">', '</h1>')
        if link:
            print id, link, title.replace("\n", " ")


if __name__ == '__main__':

    URL = "http://ucdchina.com/snap/%s"
    for i in xrange(1250, 12460 + 1):
        spider.put(URL % i)

    # 10 concurrent fetch threads; 30-second page-read timeout
    spider.run(10, 30)