Example #1
@route('/find/recommend')
class _(Handler):
    def get(self):
        now_id = int(self.get_argument("id", 0))
        page = int(self.get_argument("pi", 0))
        if now_id:
            for link in self.extract_all('<h3 class="nickname">', '</h3>'):
                link = extract('"/', '"', link)
                spider.put("http://xianguo.com/" + link)
            if page == 0:
                # pi=0 is the first page; collect the other page indexes from it
                page_list = set(
                    self.extract_all('href="/find/recommend?pi=', '&'))
                for pi in map(int, page_list):
                    if pi:
                        spider.put(
                            "http://xianguo.com/find/recommend?id=%s&pi=%s" %
                            (now_id, pi))
        else:
            for tid in self.extract_all('href="/find/recommend?id=', '"'):
                spider.put("http://xianguo.com/find/recommend?id=%s&pi=0" % tid)


if __name__ == '__main__':

    URL = 'http://xianguo.com/find/recommend'
    spider.put(URL)

    # 10 concurrent fetch threads; 30-second page read timeout
    spider.run(10, 30)
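
These handlers lean on the framework's extract/extract_all string-slicing helpers. Their real implementations live in the spider library; a minimal sketch of the behavior the snippets assume (the None fallback is a guess) could look like this:

def extract(begin, end, html):
    # Return the text between the first `begin` and the following `end`.
    start = html.find(begin)
    if start < 0:
        return None
    start += len(begin)
    stop = html.find(end, start)
    if stop < 0:
        return None
    return html[start:stop]

def extract_all(begin, end, html):
    # Return every non-overlapping stretch between `begin` and `end`.
    result = []
    pos = 0
    while True:
        start = html.find(begin, pos)
        if start < 0:
            return result
        start += len(begin)
        stop = html.find(end, start)
        if stop < 0:
            return result
        result.append(html[start:stop])
        pos = stop + len(end)

The Handler methods presumably apply the same slicing to self.html, which is why get() can call self.extract_all with no explicit html argument.
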
Example #2
            doc_id = collection.insert(item)
            # add the document to the search index
            solrConnection.add(
                id=doc_id,
                title=item.get('title'),
                description=item.get('description'),
                subtitle=item.get('subtitle'),
                information=item.get('information'))
        # commit the batch to solr once per page
        solrConnection.commit()

@route('/images/.+')
class pic(Handler):
    def get(self):
        # route has no .path attribute; take the filename from the request URL
        save_pic(self.html, self.request.url.split('/')[-1])

@route('/wp-content/uploads/.+')
class pic2(Handler):
    def get(self):
        save_pic(self.html, self.request.url.split('/')[-1])


def save_pic(content, fname):
    basepath = join(PREFIX, 'images')
    fpath = join(basepath, fname)
    # the with-block closes the file even if the write fails
    with open(fpath, 'wb') as f:
        f.write(content)
    print 'Download image: ' + fname

if __name__ == '__main__':
    spider.put('http://www.durex.com.cn/products')
    spider.run(5,100)
    item.writedb()
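
The patterns handed to @route ('/images/.+') read like regular expressions. The framework's dispatcher is not shown in these examples, so this is only a guessed stand-in; the route and dispatch names and the '$' anchoring are assumptions:

import re

_routes = []

def route(pattern):
    # Register a Handler class under a URL-path regex.
    def decorator(cls):
        _routes.append((re.compile(pattern + '$'), cls))
        return cls
    return decorator

def dispatch(path):
    # Return the first registered Handler whose pattern matches the path.
    for regex, cls in _routes:
        if regex.match(path):
            return cls
    return None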

Example #3
        html = html[:html.rfind('</div>')]
        tid = int(self.get_argument('tid'))
        print tid, name
        self.page.append((tid, self.request.url, name, html))

    @classmethod
    def write(cls):
        page = cls.page
        page.sort(key=itemgetter(0), reverse=True)
        with open(join(PREFIX, 'ecocn_org.xml'), 'w') as rss:
            rss.write(
                cls.template.render(
                    rss_title='经济学人 . 中文网',
                    rss_link='http://www.ecocn.org',
                    li=[
                        dict(
                            link=link,
                            title=title,
                            txt=txt
                        ) for _id, link, title, txt in page
                    ]
                )
            )

if __name__ == '__main__':
    spider.put('http://www.ecocn.org/portal.php?mod=list&catid=1')
    # 3 concurrent fetch threads; 30-second page read timeout
    spider.run(3, 30)
    forum.write()
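
cls.template.render(...) points at a Mako- or Jinja2-style template that the example does not include. Purely as an illustration, a Jinja2 template taking the same rss_title, rss_link, and li names might look like this (the RSS markup here is an assumption, not the original template):

from jinja2 import Template

template = Template(u"""<?xml version="1.0" encoding="utf-8"?>
<rss version="2.0">
  <channel>
    <title>{{ rss_title }}</title>
    <link>{{ rss_link }}</link>
    {% for item in li %}
    <item>
      <title>{{ item.title }}</title>
      <link>{{ item.link }}</link>
      <description>{{ item.txt }}</description>
    </item>
    {% endfor %}
  </channel>
</rss>
""")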

Example #4
        html = self.extract('<div class="t_fsz">', '<div id="comment_')
        html = html[:html.rfind('</div>')]
        tid = int(self.get_argument('tid'))
        print tid, name
        self.page.append((tid, self.request.url, name, html))

    @classmethod
    def write(cls):
        page = cls.page
        page.sort(key=itemgetter(0), reverse=True)
        with open(join(PREFIX, 'ecocn_org.xml'), 'w') as rss:
            rss.write(
                cls.template.render(
                    rss_title='经济学人 . 中文网',
                    rss_link='http://www.ecocn.org',
                    li=[
                        dict(
                            link=link,
                            title=title,
                            txt=txt
                        ) for _id, link, title, txt in page
                    ]
                )
            )

if __name__ == '__main__':
    spider.put('http://www.ecocn.org/portal.php?mod=list&catid=1')
    # 10 concurrent fetch threads; 30-second page read timeout
    spider.run(10, 30)

Example #5
                    if link not in self.indexes:
                        print 'link_index', link
                        self.indexes[link] = None
                        spider.put(link)

template = unicode("""---
title: %s
date: %s
categories: %s
---
""", 'utf-8')

if __name__ == '__main__':
    spider.put("http://%s/index_1.html" % site)
#    spider.put ("http://frostyplanet.blogbus.com/c1566502/")
    spider.run(5, 5)
        
    if not os.path.exists(os.path.join(PREFIX, "output")):
        # create the output directory under PREFIX, not the cwd
        os.mkdir(os.path.join(PREFIX, "output"))
    for _id, title, times, category, html in blog_post.posts:
        file_path = os.path.join(PREFIX, "output", str(category) + "_" + str(_id))
        content = template % (title, times, category) + unicode(html, 'utf-8')
        with open(file_path, "w") as f:
            f.write(content.encode('utf-8'))
        print "wrote", file_path
    

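The indexes dict in the crawl fragment above stores only None values, i.e. it serves as a seen-set that keeps a link from being enqueued twice. A plain set expresses the same dedup-before-enqueue logic more directly (a sketch; put_once is a hypothetical helper name):

seen = set()

def put_once(spider, link):
    # Enqueue each URL at most once; the set replaces the dict of Nones.
    if link not in seen:
        seen.add(link)
        spider.put(link)
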
Example #6
            array_files = output.split('|')
            return array_files

    def get_volume_name(self):
        volume_name = VOLUMES_NAME_DICT[self.request.url]
        return volume_name

    def get(self):
        for segment in self.extract_all('<script>var s_files="', '</script>'):
            segment_list = segment.split('";')
            s_files = segment_list[0]
            # http://www.blgl8.com/script/ds/ds.js
            s_ds = 'http://comic.1mh.in:2813/'
            s_path = segment_list[1].split('="')[1]
            array_files = self.get_array_files(s_files)
            volume_name = self.get_volume_name()

            for pic_file in array_files:
                # print volume_name + s_ds + s_path + pic_file
                wget_command = 'wget ' + s_ds + s_path + pic_file + \
                               " -P " + volume_name
                print wget_command
                # status, output = commands.getstatusoutput(wget_command)
                # print status, output


if __name__ == '__main__':
    # spider.put('http://www.blgl8.com/comic-i/blgl44469/')
    spider.put('http://www.blgl8.com/comic-i/blgl39895/')
    spider.run(1, 5)
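
Shelling out to wget works, but the same fetch can stay in-process. A sketch with the Python 2 standard library (download is a hypothetical helper mirroring `wget URL -P dir`):

import os
import urllib

def download(url, dest_dir):
    # Fetch one image into dest_dir, creating the directory on first use.
    if not os.path.isdir(dest_dir):
        os.makedirs(dest_dir)
    fname = url.split('/')[-1]
    urllib.urlretrieve(url, os.path.join(dest_dir, fname))
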
Example #7
            number = filter(lambda ch: ch in "0123456789", li.text)
            bracket_index = li.text.index("(")
            brand_name = li.text[0:bracket_index]
            brand_list.append(brand_name + " " + number)
            print brand_name + ":" + number

            with open(join(PREFIX, "brands.txt"), "w") as f:
                # the with-block closes the file; no explicit close() needed
                for brand in brand_list:
                    f.write("%s\n" % brand)

            page_size = 20
            page_num = 0
            if int(number) % page_size != 0:
                page_num = int(number) / page_size + 1
            else:
                page_num = int(number) / page_size
            for pn in range(page_num):
                page = pn + 1
                url = link + "pn" + str(page)
                print "Fetch data from: " + HTTP % url
                spider.put(HTTP % url)


if __name__ == "__main__":
    #    spider.put('http://www.yeedou.com/anquantao-c2441/')
    readHtml()
    spider.run(5, 3000)
    item.writedb()
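
The page_num branch above is ceiling division; it collapses into one expression. A self-contained sketch (ceil_div is a hypothetical helper):

def ceil_div(total, page_size):
    # Round total / page_size up without the if/else branch.
    return (total + page_size - 1) // page_size

assert ceil_div(41, 20) == 3
assert ceil_div(40, 20) == 2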