Beispiel #1
0
class RabbitDoc(Item):
    """Item declaring two XPath-selected fields, ``url`` and ``title``."""

    # href of each <a> that is a direct child of div.content_box2.
    url = XPath('//div[@class="content_box2"]/a/@href')
    # Text of the <h2> found at a/div/h2 below div.content_box2.
    title = XPath('//div[@class="content_box2"]/a/div/h2/text()')

    class Meta:
        # Matches divs whose class attribute is exactly
        # "clearfix content_box estate_external".
        # NOTE(review): presumably one item is built per matched node - confirm.
        source = XPath('//div[@class="clearfix content_box estate_external"]')
        # NOTE(review): looks like {exposed path pattern: source-site path},
        # with ":community" carried from the key into the value - confirm
        # against the framework's routing rules.
        route = {'/House?Community=:community': '/:community/s5'}
Beispiel #2
0
class Post(Item):
    """Item declaring ``url`` and ``title`` fields read from anchors."""

    # href attribute of every <a>.
    url = XPath('//a/@href')
    # Text content of every <a>.
    title = XPath('//a/text()')

    class Meta:
        # Matches divs whose class attribute is exactly "result".
        source = XPath('//div[@class="result"]')
        # Raw string: "\." is not a valid str escape, so the non-raw form
        # emits an invalid-escape warning on modern Python. The value itself
        # is unchanged (backslash, dot, plus).
        # NOTE(review): if this is consumed as a regex it only matches runs
        # of literal dots; '.+' may have been intended - confirm.
        route = r'\.+'
Beispiel #3
0
class Movie(Item):
    """Item declaring ``url`` and ``title`` fields read from a.zoom anchors."""

    # href attribute of <a class="zoom">.
    url = XPath('//a[@class="zoom"]/@href')
    # title attribute of the same <a class="zoom"> anchors.
    title = XPath('//a[@class="zoom"]/@title')

    class Meta:
        # <li> children of the element with id "post_container".
        # NOTE(review): presumably one item is built per matched <li> - confirm.
        source = XPath('//*[@id="post_container"]/li')
        # NOTE(review): presumably the path this item is served for - confirm.
        route = '/'
Beispiel #4
0
class Post(Item):
    """Item declaring ``url`` and ``title`` fields read from a.storylink."""

    # "[1]" keeps only the first a.storylink among its siblings.
    url = XPath('//a[@class="storylink"][1]/@href')
    title = XPath('//a[@class="storylink"][1]/text()')

    class Meta:
        # Table rows whose class attribute is exactly "athing".
        # NOTE(review): presumably one item is built per matched row - confirm.
        source = XPath('//tr[@class="athing"]')
        # NOTE(review): presumably the path this item is served for - confirm.
        route = '/'
Beispiel #5
0
class User(Item):
    """Item declaring ``url`` and ``name`` fields read from a.hnuser."""

    # "[1]" keeps only the first a.hnuser among its siblings.
    url = XPath('//a[@class="hnuser"][1]/@href')
    name = XPath('//a[@class="hnuser"][1]/text()')

    class Meta:
        # Table rows whose class attribute is exactly "athing".
        source = XPath('//tr[@class="athing"]')
        # Raw string so "\?" and "\d" reach the regex engine without relying
        # on Python's deprecated handling of unknown str escapes. The value
        # is identical to the previous non-raw literal.
        route = r'/news\?p=\d+'
Beispiel #6
0
class HotBook(MyItem):
    """Item with ``title``, ``author``, ``url`` and ``book_id`` fields.

    ``title``/``author`` share one XPath (first text node of a.xst) and
    ``url``/``book_id`` share another (its href); the ``clean_*`` methods
    carve the individual values out of those shared strings.
    """

    __base_url__ = 'http://91baby.mama.cn'
    title = XPath('//a[@class="xst"]/text()[1]')
    author = XPath('//a[@class="xst"]/text()[1]')
    url = XPath('//a[@class="xst"]/@href')
    book_id = XPath('//a[@class="xst"]/@href')

    def clean_title(self, title):
        """Return at most 10 characters of the text enclosed in 《...》.

        Returns None when ``title`` contains no opening 《. When the closing
        》 is missing, everything after 《 is used (previously ``find``
        returned -1 and the slice silently dropped the last character).
        """
        start = title.find('《')
        if start == -1:
            return None
        end = title.find('》', start + 1)
        if end == -1:
            end = len(title)
        return title[start + 1:end][:10]

    def clean_author(self, author):
        """Return the text between the first colon and the next "(".

        Returns None when ``author`` contains no colon. The original code had
        two byte-identical ``':' in author`` branches, so the second one was
        unreachable; it presumably targeted the full-width colon, so both the
        ASCII and full-width forms of ":" and "(" are accepted here.
        When no opening parenthesis follows the colon, the rest of the string
        is returned instead of losing its final character.
        """
        # ASCII colon first so inputs handled before keep the same result.
        for colon in (':', '\uff1a'):
            start = author.find(colon)
            if start != -1:
                break
        else:
            return None

        # Earliest ASCII or full-width "(" after the colon, if any.
        stops = [
            pos
            for pos in (author.find('(', start + 1),
                        author.find('\uff08', start + 1))
            if pos != -1
        ]
        end = min(stops) if stops else len(author)
        return author[start + 1:end]

    def clean_book_id(self, book_id):
        """Return the second "-"-separated piece of ``book_id``.

        Raises IndexError when ``book_id`` contains no "-".
        """
        return book_id.split('-')[1]

    class Meta:
        # <tbody> elements whose class attribute is exactly "thread_tbody".
        source = XPath('//tbody[@class="thread_tbody"]')
        # NOTE(review): looks like {exposed path pattern: source-site path}
        # with ":page" as a shared placeholder - confirm.
        route = {'/hotbook?page=:page': '/forum-171-:page.html'}
Beispiel #7
0
class Book(Item):
    """Item with ``title``, ``author``, ``page`` and ``contents`` fields.

    ``title`` and ``author`` are cut out of the same text node by their
    respective ``clean_*`` methods.
    """

    __base_url__ = 'http://91baby.mama.cn'

    title = XPath('//*[@id="wp"]/div[3]/text()[3]')
    author = XPath('//*[@id="wp"]/div[3]/text()[3]')
    page = XPath('//div[@class="pg"]/a[@class="last"]/text()')
    contents = XPath('//td[@class="t_f"]')

    def clean_title(self, title):
        """Return the text enclosed in the first 《...》 pair.

        Raises IndexError when ``title`` contains no 《.
        """
        return title.split('《')[1].split('》')[0]

    def clean_author(self, author):
        """Return everything after the "作者:" ("author:") marker.

        Returns None when the marker is absent. Previously ``find`` returned
        -1 in that case and the method returned ``author[2:]``, i.e. an
        arbitrary tail of unrelated text.
        """
        marker = '作者:'
        start = author.find(marker)
        if start == -1:
            return None
        return author[start + len(marker):]

    def clean_contents(self, contents):
        """Return the string value of every node in ``contents`` as a list.

        Each node's text is passed through ``strip`` (a helper defined
        outside this class). Every entry shorter than 128 characters is
        preceded by a fixed marker line that reads "the book is finished;
        what follows are reader reviews".
        """
        text = []
        for item in contents:
            content = strip(item.xpath('string(.)'))
            if len(content) < 128:
                text.append('全书完结!!! 以下的内容是网友书评!')
            text.append(content)
        return text

    def clean_page(self, page):
        """Return the page count parsed from the first entry of ``page``.

        Returns 1 when ``page`` is empty. A "..." in the entry is removed
        before conversion; raises ValueError if the rest is not an integer.
        """
        # The stray debug print() that used to live here wrote to stdout on
        # every parse and has been removed.
        if not page:
            return 1
        return int(page[0].replace('...', ''))

    class Meta:
        source = None
        # NOTE(review): the key contains two "?"; the second may have been
        # meant as "&". Left untouched because it is the exposed route.
        route = {'/book?id=:id?page=:page': '/thread-:id-:page-1.html'}
Beispiel #8
0
    class Post(Item):
        """Item declaring ``url`` and ``title`` fields read from a.storylink."""

        # "[1]" keeps only the first a.storylink among its siblings.
        url = XPath('//a[@class="storylink"][1]/@href')
        title = XPath('//a[@class="storylink"][1]/text()')

        class Meta:
            # Table rows whose class attribute is exactly "athing".
            source = XPath('//tr[@class="athing"]')
            # NOTE(review): looks like {exposed path pattern: source-site
            # path} with ":page" as a shared placeholder - confirm.
            route = {'/all?page=:page': '/news?p=:page'}
Beispiel #9
0
class SearchResult(Item):
    """Item with ``slug``, ``nom``, ``type`` and ``format`` fields.

    ``nom`` and ``type`` are read from the same span.details text and split
    apart on commas by their ``clean_*`` methods.
    """

    __base_url__ = 'https://medicament.ma'
    slug = XPath('//td/a/@href')
    nom = XPath('//td/a/span[@class="details"]/text()')
    type = XPath('//td/a/span[@class="details"]/text()')
    format = XPath('//td/a/span[@class="details"]/span[@class="small"]/text()')

    class Meta:
        # Rows of the table inside div.search-results.
        source = XPath('//div[@class="search-results"]//table/tbody/tr')
        # NOTE(review): looks like {exposed path pattern: source-site path}
        # with ":query", ":choice" and ":keyword" as shared placeholders.
        route = {
            '/search/?q=:query&c=:choice&k=:keyword':
            '/?s=:query&choice=:choice&keyword=:keyword'
        }

    def clean_slug(self, urls):
        """Return the last path segment of the first medicament URL.

        A URL qualifies when it starts with ``<__base_url__>/medicament/``.
        Returns None when no URL in ``urls`` qualifies.
        """
        # Plain prefix test instead of a regex: the base URL contains "."
        # which, unescaped in a pattern, would match any character.
        prefix = '{}/medicament/'.format(self.__base_url__)
        for url in urls:
            if url.startswith(prefix):
                return url.rstrip('/').split('/')[-1]
        return None

    def clean_nom(self, nom):
        """Return the text before the first comma of ``nom[0]``, stripped.

        Raises IndexError when ``nom`` is empty.
        """
        return nom[0].strip().split(',')[0].strip()

    def clean_type(self, type):
        """Return the second comma-separated piece of ``type[0]``, stripped.

        Raises IndexError when ``type`` is empty or has no comma.
        """
        return type[0].strip().split(',')[1].strip()
Beispiel #10
0
class Movies(Item):
    """Item declaring ``url`` and ``title`` fields read from a.ulink."""

    # a.ulink anchors located anywhere below a <b> element.
    url = XPath('//b//a[@class="ulink"]/@href')
    title = XPath('//b//a[@class="ulink"]/text()')

    class Meta:
        # Tables whose class attribute is exactly "tbspan".
        source = XPath('//table[@class="tbspan"]')
        # Raw string so "\d" is passed through verbatim instead of relying
        # on Python's deprecated handling of unknown str escapes. The value
        # is identical to the previous literal.
        # NOTE(review): the "." before "html" is unescaped and therefore
        # matches any character if this is used as a regex.
        route = r'/html/gndy/dyzz/index_\d+.html'
Beispiel #11
0
class Post(Item):
    """Item declaring ``url`` and ``title`` fields read from a.storylink."""

    __base_url__ = 'https://news.ycombinator.com'

    url = XPath('//a[@class="storylink"]/@href')
    title = XPath('//a[@class="storylink"]/text()')

    class Meta:
        # Table rows whose class attribute is exactly "athing".
        source = XPath('//tr[@class="athing"]')
        # Raw string so "\?" and "\d" are passed through verbatim instead of
        # relying on Python's deprecated handling of unknown str escapes.
        # The value is identical to the previous literal.
        route = r'/news\?p=\d+'
Beispiel #12
0
class Movie(Item):
    """Item declaring ``url`` and ``title`` fields read from a.ulink."""

    __base_url__ = 'http://www.dy2018.com'

    # a.ulink anchors located anywhere below a <b> element.
    url = XPath('//b//a[@class="ulink"]/@href')
    title = XPath('//b//a[@class="ulink"]/text()')

    class Meta:
        # Tables whose class attribute is exactly "tbspan".
        source = XPath('//table[@class="tbspan"]')
        # Raw string so "\d" is passed through verbatim instead of relying
        # on Python's deprecated handling of unknown str escapes. The value
        # is identical to the previous literal.
        # NOTE(review): the "." before "html" is unescaped and therefore
        # matches any character if this is used as a regex.
        route = r'/html/gndy/dyzz/index_\d+.html'
Beispiel #13
0
class MovieList(Item):
    """Item declaring ``url`` and ``title`` fields read from a.ulink."""

    # a.ulink anchors located anywhere below a <b> element.
    url = XPath('//b//a[@class="ulink"]/@href')
    title = XPath('//b//a[@class="ulink"]/text()')

    class Meta:
        # Tables whose class attribute is exactly "tbspan".
        source = XPath('//table[@class="tbspan"]')
        # Raw string so "\d" is passed through verbatim instead of relying
        # on Python's deprecated handling of unknown str escapes. The value
        # is identical to the previous literal.
        route = r'/html/gndy/dyzz/(index_\d+.html)?'

    def clean_url(self, url):
        """Rewrite ``url`` to ``/movies/<name>/``.

        ``<name>`` is the last "/"-separated segment of ``url`` cut at its
        first ".", e.g. ".../12345.html" becomes "/movies/12345/".
        """
        return '/movies/{}/'.format(url.split('/')[-1].split('.')[0])
Beispiel #14
0
class Post(Item):
    """Item with ``url``, ``one_room`` and ``room_pic`` fields.

    The route table is loaded from the ``post_route`` file (resolved against
    the current working directory) when the class body is executed.
    """

    # title = XPath('//a[@class="js_triggerGray js_fanglist_title"]/@title')
    url = XPath('//a[@class="js_triggerGray js_fanglist_title"]/@href')
    # price_total = XPath('//div[@class="price"]/span/text()')
    # price_unit = XPath('//div[@class="price"]/text()[2]')
    one_room = XPath('//div[@class="where"]/span[1]/text()')
    room_pic = XPath('//img[@class="lj-lazy"]/@data-img-layout')

    class Meta:
        # <li> children of ul.house-lst.js_fang_list (exact class match).
        source = XPath('//ul[@class="house-lst js_fang_list"]/li')
        # Read the JSON route table through a context manager so the file
        # handle is closed deterministically instead of being leaked.
        with open('post_route') as _route_file:
            route = json.load(_route_file)
        # Keep the temporary handle out of the Meta namespace.
        del _route_file
Beispiel #15
0
class HotBook(Item):
    """Item with ``title``, ``url`` and ``book_id`` fields.

    ``url`` and ``book_id`` are read from the same href; ``clean_book_id``
    reduces it to a single "-"-separated piece.
    """

    __base_url__ = 'http://91baby.mama.cn'
    # First text node of a.xst.
    title = XPath('//a[@class="xst"]/text()[1]')
    # href of a.xst, used for both fields below.
    url = XPath('//a[@class="xst"]/@href')
    book_id = XPath('//a[@class="xst"]/@href')

    class Meta:
        # <tbody> elements whose class attribute is exactly "thread_tbody".
        source = XPath('//tbody[@class="thread_tbody"]')
        route = {'/hotbook?page=:page': '/forum-171-:page.html'}

    def clean_book_id(self, book_id):
        """Return the second "-"-separated piece of ``book_id``.

        Raises IndexError when ``book_id`` contains no "-".
        """
        segments = book_id.split('-')
        return segments[1]
class MovieData(Item):
    """Item with ``title``, ``year``, ``genre`` and ``imdb_rating`` fields."""

    # Text of <h1>.
    title = XPath('//h1/text()')
    # Text of the first and second <h2> among their siblings.
    year = XPath('//h2[1]/text()')
    genre = XPath('//h2[2]/text()')
    # Text of the span marked itemprop="ratingValue".
    imdb_rating = XPath('//span[@itemprop="ratingValue"]/text()')

    def clean_genre(self, genre):
        """Split ``genre`` on "/" and return the whitespace-stripped parts."""
        return [part.strip() for part in genre.split('/')]

    class Meta:
        # The div with id "movie-info".
        source = XPath('//div[@id="movie-info"]')
        route = {'/movie_data?href=:href': '/movie/:href'}
Beispiel #17
0
class MovieList(Item):
    """Item declaring ``url`` and ``title`` fields read from a.ulink."""

    # a.ulink anchors located anywhere below a <b> element.
    url = XPath('//b//a[@class="ulink"]/@href')
    title = XPath('//b//a[@class="ulink"]/text()')

    class Meta:
        # Tables whose class attribute is exactly "tbspan".
        source = XPath('//table[@class="tbspan"]')
        # Entry order is kept exactly as written.
        route = {
            '/movies/?page=1': '/html/gndy/dyzz/',
            '/movies/?page=:page': '/html/gndy/dyzz/index_:page.html',
            '/movies/': '/html/gndy/dyzz/'
        }

    def clean_url(self, url):
        """Rewrite ``url`` to ``/movies/<name>/``.

        ``<name>`` is the last "/"-separated segment of ``url`` cut at its
        first ".".
        """
        last_segment = url.rsplit('/', 1)[-1]
        name = last_segment.split('.', 1)[0]
        return '/movies/{}/'.format(name)
Beispiel #18
0
class Detail(Item):
    """Item with ``field`` and ``value`` entries read from table cells."""

    __base_url__ = 'https://medicament.ma/medicament'

    # <td class="field"> and <td class="value"> cells.
    field = XPath('//td[@class="field"]')
    value = XPath('//td[@class="value"]')

    class Meta:
        # Rows of any table inside div.single.single-medicament (exact
        # class match).
        source = XPath('//div[@class="single single-medicament"]//table//tr')
        route = {
            '/detail/:slug': '/:slug'
        }

    def clean_field(self, field):
        """Return the text before the first ":" with trailing space removed."""
        label = field.split(':', 1)[0]
        return label.rstrip()
Beispiel #19
0
class Pixabay(Item):
    """Item declaring a single ``img`` field."""

    __base_url__ = 'https://pixabay.com/'
    # src attribute of <img> elements located anywhere below an <a>.
    img = XPath('//a//img/@src')

    class Meta:
        # Divs whose class attribute is exactly "item".
        source = XPath('//div[@class="item"]')
        # NOTE(review): looks like {exposed path pattern: source-site path}
        # with ":key" as a shared placeholder - confirm.
        route = {'/pic/?q=:key': '/zh/photos/?q=:key'}
Beispiel #20
0
 class Meta:
     # Divs whose class attribute is exactly "pager".
     source = XPath('//div[@class="pager"]')
     # NOTE(review): looks like {exposed path pattern: source-site path};
     # the two category entries map a path to itself, and the search entry
     # appends a fixed "cat=1001" query parameter - confirm the semantics.
     route = {
         '/category/:cat/': '/category/:cat/',
         '/category/:cat/?page=:page': '/category/:cat/?page=:page',
         '/search/:keyword': '/search/?keyword=:keyword&cat=1001'
     }
Beispiel #21
0
 class Meta:
     # <li> entries of ul.list inside div.normal-recipe-list, below any div
     # whose class attribute contains "main-panel".
     source = XPath('//div[contains(@class, "main-panel")]//div[@class="normal-recipe-list"]/ul[@class="list"]/li')
     # NOTE(review): looks like {exposed path pattern: source-site path};
     # the two category entries map a path to itself, and the search entry
     # appends a fixed "cat=1001" query parameter - confirm the semantics.
     route = {
         '/category/:cat/': '/category/:cat/',
         '/category/:cat/?page=:page': '/category/:cat/?page=:page',
         '/search/:keyword': '/search/?keyword=:keyword&cat=1001'
     }
Beispiel #22
0
 class Meta:
     # Rows of the table inside div.search-results.
     source = XPath('//div[@class="search-results"]//table/tbody/tr')
     # Single route entry; the target is written as one literal rather than
     # two adjacent fragments split in the middle of a word.
     route = {
         '/search/?q=:query&c=:choice&k=:keyword':
         '/?s=:query&choice=:choice&keyword=:keyword'
     }
Beispiel #23
0
 class Meta:
     # Tables whose class attribute is exactly "tbspan".
     source = XPath('//table[@class="tbspan"]')
     # NOTE(review): looks like {exposed path pattern: source-site path}.
     # "/movies/?page=1" and "/movies/" both point at the index page, while
     # other page numbers go to index_<page>.html - confirm whether entry
     # order matters to the router.
     route = {
         '/movies/?page=1': '/html/gndy/dyzz/',
         '/movies/?page=:page': '/html/gndy/dyzz/index_:page.html',
         '/movies/': '/html/gndy/dyzz/'
     }
Beispiel #24
0
class Pexels(Item):
    """Item declaring a single ``img`` field."""

    __base_url__ = 'https://www.pexels.com'
    # src attribute of <img> elements located anywhere below an <a>.
    img = XPath('//a//img/@src')

    class Meta:
        # <article> elements whose class attribute is exactly "photo-item".
        source = XPath('//article[@class="photo-item"]')
        # NOTE(review): looks like {exposed path pattern: source-site path}
        # with ":key" as a shared placeholder - confirm.
        route = {'/pic/?q=:key': '/search/:key/'}
Beispiel #25
0
class GIO(Item):
    """Item declaring a single ``gio_list`` field."""

    # href of <a> children of div.option-list.gio_district (exact class
    # match).
    gio_list = XPath('//div[@class="option-list gio_district"]/a/@href')

    class Meta:
        # NOTE(review): no source selector is set; presumably the whole page
        # is parsed as one item - confirm.
        source = None
        route = {
            '/zufang/': '/zufang/',
        }
Beispiel #26
0
class Book(Item):
    """Item with ``title``, ``author``, ``total_page`` and ``contents``.

    ``title`` and ``author`` are cut out of the same text node by their
    respective ``clean_*`` methods.
    """

    __base_url__ = 'http://91baby.mama.cn'

    title = XPath('//*[@id="wp"]/div[3]/text()[3]')
    author = XPath('//*[@id="wp"]/div[3]/text()[3]')
    total_page = XPath('//span[@class="pgt"]/div//a')
    contents = XPath('//td[@class="t_f"]')

    def clean_title(self, title):
        """Return the text enclosed in the first 《...》 pair.

        Raises IndexError when ``title`` contains no 《.
        """
        return title.split('《')[1].split('》')[0]

    def clean_author(self, author):
        """Return everything after the "作者:" ("author:") marker.

        Returns None when the marker is absent. Previously ``find`` returned
        -1 in that case and the method returned ``author[2:]``, i.e. an
        arbitrary tail of unrelated text.
        """
        marker = '作者:'
        start = author.find(marker)
        if start == -1:
            return None
        return author[start + len(marker):]

    def clean_contents(self, contents):
        """Return ``{position: lines}`` for every kept node of ``contents``.

        ``position`` is the node's index in ``contents``. Nodes whose text
        contains "当前被收藏数" are skipped as leading boilerplate, so the
        keys may have gaps. ``strip`` and ``strip_list`` are helpers defined
        outside this class; ``strip_list`` receives the text split on
        newlines.
        """
        book_contents = {}
        for index, item in enumerate(contents):
            content = strip(item.xpath('string(.)'))
            # Skip the introductory boilerplate block.
            if '当前被收藏数' in content:
                continue
            # Break the text into lines and let the helper filter them.
            book_contents[index] = strip_list(content.split('\n'))
        return book_contents

    def clean_total_page(self, total_page):
        """Return the number shown on the link just before "下一页".

        "下一页" is the "next page" link. The link preceding it carries the
        last page number, optionally prefixed with "...". Returns 1 whenever
        that number cannot be determined: no links, no "next page" link,
        nothing before it, or a label that is not an integer.
        """
        # Locate the "next page" link; for/else covers the not-found case
        # that the old bare ``except:`` used to hide behind an unbound name.
        for index, link in enumerate(total_page or ()):
            label = link.xpath('./text()')
            if label and label[0] == '下一页':
                break
        else:
            return 1

        # Nothing precedes the "next page" link, so there is no page number
        # to read (index - 1 would otherwise wrap around to the last link).
        if index == 0:
            return 1

        last_label = total_page[index - 1].xpath('./text()')
        if not last_label:
            return 1
        try:
            # int() tolerates surrounding whitespace, so "... 50" and
            # "...50" both parse once the dots are removed.
            return int(last_label[0].replace('...', ''))
        except ValueError:
            return 1

    class Meta:
        source = None
        route = {'/book_id=:id?page=:page': '/thread-:id-:page-1.html'}
Beispiel #27
0
class Page(Item):
    """Item declaring a single ``next_page`` field."""

    # href of a.morelink.
    next_page = XPath('//a[@class="morelink"]/@href')

    def clean_next_page(self, next_page):
        """Return ``next_page`` prefixed with the fixed site prefix.

        Raises TypeError when ``next_page`` is not a string.
        """
        # NOTE(review): the prefix starts with "/" before the scheme, which
        # looks unusual - confirm it is what consumers expect.
        return "/https://news.ycombinator.com/" + next_page

    class Meta:
        source = None
        # Raw string so "\?" and "\d" are passed through verbatim instead of
        # relying on Python's deprecated handling of unknown str escapes.
        # The value is identical to the previous literal.
        route = r'/news\?p=\d+'
Beispiel #28
0
 class Meta:
     # <li> entries of ul.list inside div.normal-recipe-list, which must be
     # a direct child of div.ing-recipe.
     source = XPath(
         '//div[@class="ing-recipe"]/div[@class="normal-recipe-list"]/ul[@class="list"]/li'
     )
     # NOTE(review): looks like {exposed path pattern: source-site path};
     # the two category entries map a path to itself, and the search entry
     # appends a fixed "cat=1001" query parameter - confirm the semantics.
     route = {
         '/category/:cat/': '/category/:cat/',
         '/category/:cat/?page=:page': '/category/:cat/?page=:page',
         '/search/:keyword': '/search/?keyword=:keyword&cat=1001'
     }
Beispiel #29
0
    class Page(Item):
        """Item declaring a single ``next_page`` field."""

        # href of a.morelink.
        next_page = XPath('//a[@class="morelink"]/@href')

        class Meta:
            source = None
            route = {'/all?page=:page': '/news?p=:page'}

        def clean_next_page(self, next_page):
            """Return ``str(next_page)`` prefixed with the local server URL."""
            local_prefix = "http://127.0.0.1:5000/"
            suffix = str(next_page)
            return local_prefix + suffix
Beispiel #30
0
    class Page(Item):
        """Item declaring a single ``next_page`` field."""

        # href of a.morelink.
        next_page = XPath('//a[@class="morelink"]/@href')

        class Meta:
            source = None
            # Raw string so "\?" and "\d" are passed through verbatim instead
            # of relying on Python's deprecated handling of unknown str
            # escapes. The value is identical to the previous literal.
            route = r'/news\?p=\d+'

        def clean_next_page(self, next_page):
            """Return ``next_page`` prefixed with the local server URL.

            Raises TypeError when ``next_page`` is not a string.
            """
            return "http://127.0.0.1:5000/" + next_page