class RabbitDoc(Item):
    """Listing entry exposing a link and its heading text."""

    # Both fields are read from the anchor inside the "content_box2" div.
    url = XPath('//div[@class="content_box2"]/a/@href')
    title = XPath('//div[@class="content_box2"]/a/div/h2/text()')

    class Meta:
        # One item per matching container element.
        source = XPath('//div[@class="clearfix content_box estate_external"]')
        # Route table: '/House?Community=...' is paired with '/<community>/s5'.
        route = {
            '/House?Community=:community': '/:community/s5',
        }
class Post(Item):
    """Search-result entry exposing a link target and its text."""

    url = XPath('//a/@href')
    title = XPath('//a/text()')

    class Meta:
        # One item per <div class="result"> element.
        source = XPath('//div[@class="result"]')
        # Raw string avoids the invalid "\." escape warning; the value is
        # unchanged.
        # NOTE(review): as a regex this matches one or more literal dots --
        # confirm that '.+' (any path) was not the intent.
        route = r'\.+'
class Movie(Item):
    """Movie entry read from the "zoom" anchor of a list item."""

    # href and title attributes of the same <a class="zoom"> element.
    url = XPath('//a[@class="zoom"]/@href')
    title = XPath('//a[@class="zoom"]/@title')

    class Meta:
        # One item per <li> under the element with id "post_container".
        source = XPath('//*[@id="post_container"]/li')
        route = '/'
class Post(Item):
    """Story entry: link and text of the first "storylink" anchor."""

    url = XPath('//a[@class="storylink"][1]/@href')
    title = XPath('//a[@class="storylink"][1]/text()')

    class Meta:
        # One item per <tr class="athing"> row.
        source = XPath('//tr[@class="athing"]')
        route = '/'
class User(Item):
    """User entry: link and name of the first "hnuser" anchor."""

    url = XPath('//a[@class="hnuser"][1]/@href')
    name = XPath('//a[@class="hnuser"][1]/text()')

    class Meta:
        # One item per <tr class="athing"> row.
        source = XPath('//tr[@class="athing"]')
        # Raw string: "\?" and "\d" are regex escapes, not string escapes.
        # The resulting pattern is identical to the original.
        route = r'/news\?p=\d+'
class HotBook(MyItem):
    """Hot-book thread entry taken from a forum listing row."""

    __base_url__ = 'http://91baby.mama.cn'

    # title and author are both derived from the same link text; url and
    # book_id are both derived from the same href.
    title = XPath('//a[@class="xst"]/text()[1]')
    author = XPath('//a[@class="xst"]/text()[1]')
    url = XPath('//a[@class="xst"]/@href')
    book_id = XPath('//a[@class="xst"]/@href')

    def clean_title(self, title):
        """Return up to 10 characters found between '《' and '》'.

        Returns None when the text has no opening '《'. When the closing
        '》' is missing, everything after '《' is used instead of silently
        dropping the last character.
        """
        _, opening, rest = title.partition('《')
        if not opening:
            return None
        return rest.partition('》')[0][:10]

    def clean_author(self, author):
        """Return the text between the first ':' and the following '('.

        Returns None when the text has no ':'. When '(' is missing,
        everything after ':' is returned.
        """
        # NOTE(review): the original had a second, identical ``':' in author``
        # branch that could never run; it may have been meant for a
        # full-width colon -- confirm against real page data.
        _, colon, rest = author.partition(':')
        if not colon:
            return None
        return rest.partition('(')[0]

    def clean_book_id(self, book_id):
        """Return the second '-'-separated piece of the href.

        Raises IndexError when the href contains no '-'.
        """
        return book_id.split('-')[1]

    class Meta:
        # One item per <tbody class="thread_tbody"> element.
        source = XPath('//tbody[@class="thread_tbody"]')
        route = {'/hotbook?page=:page': '/forum-171-:page.html'}
class Book(Item):
    """A book thread page: title, author, last page number and post bodies."""

    __base_url__ = 'http://91baby.mama.cn'

    # title and author are parsed out of the same text node.
    title = XPath('//*[@id="wp"]/div[3]/text()[3]')
    author = XPath('//*[@id="wp"]/div[3]/text()[3]')
    page = XPath('//div[@class="pg"]/a[@class="last"]/text()')
    contents = XPath('//td[@class="t_f"]')

    def clean_title(self, title):
        """Return the text enclosed in '《...》'.

        Raises IndexError when '《' is absent.
        """
        return title.split('《')[1].split('》')[0]

    def clean_author(self, author):
        """Return the text following the '作者:' marker, or None if absent."""
        marker = '作者:'
        position = author.find(marker)
        if position == -1:
            # Previously -1 + 3 produced a bogus slice starting at index 2.
            return None
        return author[position + len(marker):]

    def clean_contents(self, contents):
        """Return the stripped text of every post body, in order.

        A fixed notice (roughly "book finished, reader reviews follow") is
        inserted before each body shorter than 128 characters.
        """
        text = []
        for item in contents:
            content = strip(item.xpath('string(.)'))
            if len(content) < 128:
                text.append('全书完结!!! 以下的内容是网友书评!')
            text.append(content)
        return text

    def clean_page(self, page):
        """Return the last page number, or 1 when no "last" link matched."""
        if not page:
            return 1
        # The link text may carry a leading ellipsis, e.g. '... 12'.
        return int(page[0].replace('...', ''))

    class Meta:
        source = None
        route = {'/book?id=:id?page=:page': '/thread-:id-:page-1.html'}
class Post(Item):
    """Story entry: link and text of the first "storylink" anchor."""

    url = XPath('//a[@class="storylink"][1]/@href')
    title = XPath('//a[@class="storylink"][1]/text()')

    class Meta:
        # One item per <tr class="athing"> row.
        source = XPath('//tr[@class="athing"]')
        # Route table: '/all?page=N' is paired with '/news?p=N'.
        route = {
            '/all?page=:page': '/news?p=:page',
        }
class SearchResult(Item):
    """One row of the medicament.ma search-results table."""

    __base_url__ = 'https://medicament.ma'

    slug = XPath('//td/a/@href')
    # nom and type are both cut from the same "details" span text.
    nom = XPath('//td/a/span[@class="details"]/text()')
    type = XPath('//td/a/span[@class="details"]/text()')
    format = XPath('//td/a/span[@class="details"]/span[@class="small"]/text()')

    class Meta:
        # One item per row of the results table.
        source = XPath('//div[@class="search-results"]//table/tbody/tr')
        route = {
            '/search/?q=:query&c=:choice&k=:keyword':
                '/?s=:query&choice=:choice&keyword=:keyword',
        }

    def clean_slug(self, urls):
        """Return the last path segment of the first medicament URL.

        Returns None when no URL starts with '<base>/medicament/'.
        """
        # A plain prefix test replaces the previous regex, in which the dots
        # of the base URL were unescaped and matched any character.
        prefix = '{}/medicament/'.format(self.__base_url__)
        for url in urls:
            if url.startswith(prefix):
                return url.rstrip('/').split('/')[-1]
        return None

    def clean_nom(self, nom):
        """Return the text before the first comma of the first match."""
        return nom[0].strip().split(',')[0].strip()

    def clean_type(self, type):
        """Return the text after the first comma of the first match.

        Raises IndexError when the text contains no comma.
        """
        return type[0].strip().split(',')[1].strip()
class Movies(Item):
    """Movie entry read from the "ulink" anchor of a listing table."""

    url = XPath('//b//a[@class="ulink"]/@href')
    title = XPath('//b//a[@class="ulink"]/text()')

    class Meta:
        # One item per <table class="tbspan"> element.
        source = XPath('//table[@class="tbspan"]')
        # Raw string: "\d" is a regex escape, not a string escape. The
        # resulting pattern is identical to the original.
        route = r'/html/gndy/dyzz/index_\d+.html'
class Post(Item):
    """Story entry: link and text of "storylink" anchors."""

    __base_url__ = 'https://news.ycombinator.com'

    url = XPath('//a[@class="storylink"]/@href')
    title = XPath('//a[@class="storylink"]/text()')

    class Meta:
        # One item per <tr class="athing"> row.
        source = XPath('//tr[@class="athing"]')
        # Raw string: "\?" and "\d" are regex escapes, not string escapes.
        # The resulting pattern is identical to the original.
        route = r'/news\?p=\d+'
class Movie(Item):
    """Movie entry read from the "ulink" anchor of a listing table."""

    __base_url__ = 'http://www.dy2018.com'

    url = XPath('//b//a[@class="ulink"]/@href')
    title = XPath('//b//a[@class="ulink"]/text()')

    class Meta:
        # One item per <table class="tbspan"> element.
        source = XPath('//table[@class="tbspan"]')
        # Raw string: "\d" is a regex escape, not a string escape. The
        # resulting pattern is identical to the original.
        route = r'/html/gndy/dyzz/index_\d+.html'
class MovieList(Item):
    """Movie listing entry whose URL is rewritten to a local '/movies/' path."""

    url = XPath('//b//a[@class="ulink"]/@href')
    title = XPath('//b//a[@class="ulink"]/text()')

    class Meta:
        # One item per <table class="tbspan"> element.
        source = XPath('//table[@class="tbspan"]')
        # Raw string: "\d" is a regex escape, not a string escape. The
        # resulting pattern is identical to the original.
        route = r'/html/gndy/dyzz/(index_\d+.html)?'

    def clean_url(self, url):
        """Return '/movies/<name>/' for the URL's file name.

        <name> is the last path segment cut at its first '.'.
        """
        filename = url.split('/')[-1]
        return '/movies/{}/'.format(filename.split('.')[0])
class Post(Item):
    """Housing listing entry: link, first "where" span and picture layout."""

    # Disabled fields are kept for reference.
    # title = XPath('//a[@class="js_triggerGray js_fanglist_title"]/@title')
    url = XPath('//a[@class="js_triggerGray js_fanglist_title"]/@href')
    # price_total = XPath('//div[@class="price"]/span/text()')
    # price_unit = XPath('//div[@class="price"]/text()[2]')
    one_room = XPath('//div[@class="where"]/span[1]/text()')
    room_pic = XPath('//img[@class="lj-lazy"]/@data-img-layout')

    class Meta:
        # One item per <li> of the house list.
        source = XPath('//ul[@class="house-lst js_fang_list"]/li')
        # The route table is read from the JSON file 'post_route' (resolved
        # against the current working directory) when the class is defined.
        # The context manager closes the handle, which the previous
        # open(...).read() one-liner left to the garbage collector.
        with open('post_route') as _route_file:
            route = json.load(_route_file)
        # Keep the temporary handle name out of the Meta namespace.
        del _route_file
class HotBook(Item):
    """Hot-book thread entry taken from a forum listing row."""

    __base_url__ = 'http://91baby.mama.cn'

    title = XPath('//a[@class="xst"]/text()[1]')
    # url and book_id are both derived from the same href.
    url = XPath('//a[@class="xst"]/@href')
    book_id = XPath('//a[@class="xst"]/@href')

    class Meta:
        # One item per <tbody class="thread_tbody"> element.
        source = XPath('//tbody[@class="thread_tbody"]')
        route = {
            '/hotbook?page=:page': '/forum-171-:page.html',
        }

    def clean_book_id(self, book_id):
        """Return the second '-'-separated piece of the href."""
        pieces = book_id.split('-')
        return pieces[1]
class MovieData(Item):
    """Movie detail record: title, year, genre list and IMDb rating."""

    title = XPath('//h1/text()')
    year = XPath('//h2[1]/text()')
    genre = XPath('//h2[2]/text()')
    imdb_rating = XPath('//span[@itemprop="ratingValue"]/text()')

    def clean_genre(self, genre):
        """Split the genre text on '/' and return the trimmed pieces."""
        return [piece.strip() for piece in genre.split('/')]

    class Meta:
        # The item is read from the <div id="movie-info"> element.
        source = XPath('//div[@id="movie-info"]')
        route = {
            '/movie_data?href=:href': '/movie/:href',
        }
class MovieList(Item):
    """Movie listing entry whose URL is rewritten to a local '/movies/' path."""

    url = XPath('//b//a[@class="ulink"]/@href')
    title = XPath('//b//a[@class="ulink"]/text()')

    class Meta:
        # One item per <table class="tbspan"> element.
        source = XPath('//table[@class="tbspan"]')
        # Entry order is preserved from the original definition.
        route = {
            '/movies/?page=1': '/html/gndy/dyzz/',
            '/movies/?page=:page': '/html/gndy/dyzz/index_:page.html',
            '/movies/': '/html/gndy/dyzz/',
        }

    def clean_url(self, url):
        """Return '/movies/<name>/' for the URL's file name.

        <name> is the last path segment cut at its first '.'.
        """
        filename = url.rsplit('/', 1)[-1]
        stem = filename.partition('.')[0]
        return '/movies/{}/'.format(stem)
class Detail(Item):
    """One field/value row of a medicament detail table."""

    __base_url__ = 'https://medicament.ma/medicament'

    field = XPath('//td[@class="field"]')
    value = XPath('//td[@class="value"]')

    class Meta:
        # One item per row of the detail table.
        source = XPath('//div[@class="single single-medicament"]//table//tr')
        route = {
            '/detail/:slug': '/:slug',
        }

    def clean_field(self, field):
        """Return the text before the first ':' with trailing whitespace removed."""
        label, _, _ = field.partition(':')
        return label.rstrip()
class Pixabay(Item):
    """Image entry (img src) from a Pixabay photo search page."""

    __base_url__ = 'https://pixabay.com/'

    img = XPath('//a//img/@src')

    class Meta:
        # One item per <div class="item"> element.
        source = XPath('//div[@class="item"]')
        route = {
            '/pic/?q=:key': '/zh/photos/?q=:key',
        }
class Meta:
    """Source selector and route table for pager elements."""

    source = XPath('//div[@class="pager"]')
    # Category paths map to themselves; search adds a fixed cat=1001 parameter.
    route = {
        '/category/:cat/': '/category/:cat/',
        '/category/:cat/?page=:page': '/category/:cat/?page=:page',
        '/search/:keyword': '/search/?keyword=:keyword&cat=1001',
    }
class Meta:
    """Source selector and route table for main-panel recipe list items."""

    # One entry per <li> of the recipe list inside the main panel.
    source = XPath(
        '//div[contains(@class, "main-panel")]//div[@class="normal-recipe-list"]/ul[@class="list"]/li'
    )
    # Category paths map to themselves; search adds a fixed cat=1001 parameter.
    route = {
        '/category/:cat/': '/category/:cat/',
        '/category/:cat/?page=:page': '/category/:cat/?page=:page',
        '/search/:keyword': '/search/?keyword=:keyword&cat=1001',
    }
class Meta:
    """Source selector and route table for search-result table rows."""

    # One entry per row of the results table.
    source = XPath('//div[@class="search-results"]//table/tbody/tr')
    # The target was previously written as two adjacent string literals;
    # the joined value is the same.
    route = {
        '/search/?q=:query&c=:choice&k=:keyword':
            '/?s=:query&choice=:choice&keyword=:keyword',
    }
class Meta:
    """Source selector and route table for movie listing tables."""

    # One entry per <table class="tbspan"> element.
    source = XPath('//table[@class="tbspan"]')
    # Entry order is preserved from the original definition.
    route = {
        '/movies/?page=1': '/html/gndy/dyzz/',
        '/movies/?page=:page': '/html/gndy/dyzz/index_:page.html',
        '/movies/': '/html/gndy/dyzz/',
    }
class Pexels(Item):
    """Image entry (img src) from a Pexels search page."""

    __base_url__ = 'https://www.pexels.com'

    img = XPath('//a//img/@src')

    class Meta:
        # One item per <article class="photo-item"> element.
        source = XPath('//article[@class="photo-item"]')
        route = {
            '/pic/?q=:key': '/search/:key/',
        }
class GIO(Item):
    """Links (href values) found in the "gio_district" option list."""

    gio_list = XPath('//div[@class="option-list gio_district"]/a/@href')

    class Meta:
        # No per-item container selector is configured.
        source = None
        route = {'/zufang/': '/zufang/'}
class Book(Item):
    """A book thread page: title, author, page count and chapter text."""

    __base_url__ = 'http://91baby.mama.cn'

    # title and author are parsed out of the same text node.
    title = XPath('//*[@id="wp"]/div[3]/text()[3]')
    author = XPath('//*[@id="wp"]/div[3]/text()[3]')
    total_page = XPath('//span[@class="pgt"]/div//a')
    contents = XPath('//td[@class="t_f"]')

    def clean_title(self, title):
        """Return the text enclosed in '《...》'.

        Raises IndexError when '《' is absent.
        """
        return title.split('《')[1].split('》')[0]

    def clean_author(self, author):
        """Return the text following the '作者:' marker, or None if absent."""
        marker = '作者:'
        position = author.find(marker)
        if position == -1:
            # Previously -1 + 3 produced a bogus slice starting at index 2.
            return None
        return author[position + len(marker):]

    def clean_contents(self, contents):
        """Return {post index: list of lines} for every kept post body.

        Posts whose text contains '当前被收藏数' are skipped (opening
        boilerplate); their indices are simply missing from the result.
        """
        book_contents = {}
        for index, item in enumerate(contents):
            content = strip(item.xpath('string(.)'))
            # Skip the opening boilerplate post.
            if '当前被收藏数' in content:
                continue
            # Split into lines and let strip_list filter them.
            book_contents[index] = strip_list(content.split('\n'))
        return book_contents

    def clean_total_page(self, total_page):
        """Return the number shown on the link just before '下一页' (next page).

        Falls back to 1 when there is no next-page link, when it is the
        first link, or when the preceding label is not a number.
        """
        try:
            previous_index = None
            for index, link in enumerate(total_page):
                if link.xpath('./text()')[0] == '下一页':
                    previous_index = index - 1
                    break
            # No next-page link, or nothing before it: treat as a single page
            # instead of relying on an unbound name or a negative index.
            if previous_index is None or previous_index < 0:
                return 1
            label = total_page[previous_index].xpath('./text()')[0]
            # The last page link may be rendered with a leading ellipsis,
            # e.g. '... 12'.
            return int(label.replace('...', '').strip())
        except (AttributeError, IndexError, TypeError, ValueError):
            # Best effort: unexpected markup means a single page.
            return 1

    class Meta:
        source = None
        route = {'/book_id=:id?page=:page': '/thread-:id-:page-1.html'}
class Page(Item):
    """Pagination record holding the "morelink" (next page) href."""

    next_page = XPath('//a[@class="morelink"]/@href')

    def clean_next_page(self, next_page):
        """Prefix the href with '/https://news.ycombinator.com/'."""
        return "/https://news.ycombinator.com/" + next_page

    class Meta:
        source = None
        # Raw string: "\?" and "\d" are regex escapes, not string escapes.
        # The resulting pattern is identical to the original.
        route = r'/news\?p=\d+'
class Meta:
    """Source selector and route table for ingredient recipe list items."""

    # One entry per <li> of the recipe list inside the "ing-recipe" div.
    source = XPath('//div[@class="ing-recipe"]/div[@class="normal-recipe-list"]/ul[@class="list"]/li')
    # Category paths map to themselves; search adds a fixed cat=1001 parameter.
    route = {
        '/category/:cat/': '/category/:cat/',
        '/category/:cat/?page=:page': '/category/:cat/?page=:page',
        '/search/:keyword': '/search/?keyword=:keyword&cat=1001',
    }
class Page(Item):
    """Pagination record holding the "morelink" (next page) href."""

    next_page = XPath('//a[@class="morelink"]/@href')

    class Meta:
        source = None
        route = {
            '/all?page=:page': '/news?p=:page',
        }

    def clean_next_page(self, next_page):
        """Return the value, converted to text, behind the local server prefix."""
        return 'http://127.0.0.1:5000/{}'.format(next_page)
class Page(Item):
    """Pagination record holding the "morelink" (next page) href."""

    next_page = XPath('//a[@class="morelink"]/@href')

    class Meta:
        source = None
        # Raw string: "\?" and "\d" are regex escapes, not string escapes.
        # The resulting pattern is identical to the original.
        route = r'/news\?p=\d+'

    def clean_next_page(self, next_page):
        """Prefix the href with the local server address."""
        return "http://127.0.0.1:5000/" + next_page