class Page(Item):
    """Pagination links read from the ``div.mt20 > div.pages`` block."""

    # ``href`` of the ``a.anext`` anchor; presumably post-processed by
    # ``clean_current`` through the framework's ``clean_<field>`` hook.
    current = Attr('div.mt20 > div.pages > a.anext', 'href')
    # ``href`` of the ``a.alast`` anchor; presumably post-processed by
    # ``clean_total``.
    total = Attr('div.mt20 > div.pages > a.alast', 'href')

    def clean_total(self, total):
        """Return the integer that follows the final ``/`` of ``total``.

        Raises ``IndexError`` when ``total`` contains no ``/`` and
        ``ValueError`` when the trailing segment is not an integer.
        """
        segments = total.rsplit('/', 1)
        return int(segments[1])

    def clean_current(self, current):
        """Return the integer after the final ``/`` of ``current``, minus 20.

        Raises ``IndexError`` when ``current`` contains no ``/`` and
        ``ValueError`` when the trailing segment is not an integer.
        """
        # NOTE(review): this print looks like leftover debug output -- confirm
        # before removing, since it writes to stdout on every call.
        print(current)
        trailing = current.rsplit('/', 1)[1]
        # NOTE(review): 20 is presumably the per-page offset, so that the
        # "next" link's value minus 20 is the current offset -- TODO confirm.
        return int(trailing) - 20
class Repic(Item):
    """Fields read from one ``a.cook-img`` / ``div.cook-info`` entry."""

    # Raw inline ``style`` of the image anchor; reduced by ``clean_img``.
    img = Attr('a.cook-img', 'style')
    # ``href`` of the same image anchor.
    url = Attr('a.cook-img', 'href')
    title = Text('div.cook-info > a.cookname')
    major = Text('div.cook-info > p.major')

    def clean_img(self, img):
        """Return the text inside ``url(...)`` of the inline style ``img``.

        The style must start with the exact declaration sequence encoded in
        the pattern below; otherwise ``re.match`` yields ``None`` and the
        attribute access raises ``AttributeError``.
        """
        style_pattern = (
            'background: url[(](.*)[)] no-repeat center center;'
            'background-size: cover;position: relative;'
        )
        found = re.match(style_pattern, img)
        return found.group(1)
class Page(Item):
    """Link to the next listing page, taken from the ``.morelink`` element."""

    next_page = Attr('.morelink', 'href')

    def clean_next_page(self, value):
        """Rewrite the scraped ``href`` with ``api.convert_string``.

        ``value`` is prefixed with ``/`` and passed together with the
        ``/news?p={page}`` pattern and a ``/posts?page={page}`` pattern rooted
        at the current request's host URL.
        NOTE(review): presumably this maps the upstream pagination URL onto
        this service's own route -- confirm against ``api.convert_string``.
        """
        scraped_path = '/' + value
        source_pattern = '/news?p={page}'
        # Drop surrounding slashes from the host URL so the join below does
        # not produce a double slash.
        target_pattern = request.host_url.strip('/') + '/posts?page={page}'
        return api.convert_string(scraped_path, source_pattern, target_pattern)
class Cookbook(Item):
    """Detail fields of a single page: title, image, counters, intro and tip."""

    title = Text('h2.title')
    # ``src`` of the image nested under ``.recipe-content``.
    img = Attr(
        '.recipe-content > div > div > div > a > img',
        'src',
    )
    # Both counters live under ``div.vcnum``; the collect counter is the span
    # carrying the ``collectnum`` class.
    browse_count = Text('div.vcnum > span')
    collect_count = Text('div.vcnum > span.collectnum')
    intro = Text('p.intro')
    tip = Text('div.tips > p')
class Page(Item):
    """Next-page link read from the ``.morelink`` element."""

    next_page = Attr(".morelink", "href")

    def clean_next_page(self, value):
        """Return ``value`` converted by ``api.convert_string``.

        The scraped ``href`` gets a leading ``/`` and is passed with the
        ``/news?p={page}`` pattern and a host-rooted ``/posts?page={page}``
        pattern.
        NOTE(review): presumably this translates the upstream page URL into
        this service's URL scheme -- confirm against ``api.convert_string``.
        """
        path = "/" + value
        # Host URL without leading/trailing slashes, so the concatenation
        # yields exactly one slash before ``posts``.
        host = request.host_url.strip("/")
        return api.convert_string(
            path,
            "/news?p={page}",
            host + "/posts?page={page}",
        )
import requests
from htmlparsing import Element, HTMLParsing, Text, Attr, Parse

# Page that is fetched and parsed below.
url = 'http://localhost:8082/home/serveList.html'

# Without a timeout, requests waits indefinitely on an unresponsive server;
# bound the wait so the script fails fast instead of hanging.
r = requests.get(url, timeout=10)

# Pull one record out of the response body: the story link's text and href,
# plus the value captured by the '>{} points' template on ``span.score``.
article_detail = HTMLParsing(r.text).detail({
    'title': Text('a.storylink'),
    'points': Parse('span.score', '>{} points'),
    'link': Attr('a.storylink', 'href'),
})
print(article_detail)
class Post(Item):
    """Post entry: the ``.read-more`` link and the ``h1 > a`` title text."""

    # ``href`` of the ``.read-more`` element.
    url = Attr('.read-more', 'href')
    # Text of the anchor inside the ``h1`` heading.
    title = Text('h1 > a')
class Post(Item):
    """Post entry whose ``url`` field asks for the attribute 'no this attribute'."""

    # NOTE(review): the attribute name looks like a deliberate fixture for the
    # missing-attribute path rather than a typo -- confirm before changing it.
    url = Attr(
        '.storylink',
        'no this attribute',
    )
    # Text of the ``.storylink`` element.
    title = Text('.storylink')
class Post(Item):
    """Post entry built from the ``.storylink`` element: its href and text."""

    # ``href`` of the ``.storylink`` element.
    url = Attr('.storylink', 'href')
    # Text of the same element.
    title = Text('.storylink')
class Post(Item):
    """Post entry: link target and link text of the ``.storylink`` element."""

    # Link target, read from the ``href`` attribute.
    url = Attr(".storylink", "href")
    # Link text.
    title = Text(".storylink")
class Post(Item):
    """Post entry whose ``url`` field asks for the attribute "no this attribute"."""

    # NOTE(review): presumably an intentionally nonexistent attribute used to
    # exercise the missing-attribute path -- confirm before changing it.
    url = Attr(
        ".storylink",
        "no this attribute",
    )
    # Text of the ``.storylink`` element.
    title = Text(".storylink")
class Course(Item):
    """Course entry: the ``href`` of its ``a`` element and the ``h4`` text."""

    # ``href`` of the ``a`` element.
    url = Attr('a', 'href')
    # Text of the ``h4`` heading.
    title = Text('h4')
class Step(Item):
    """Single step: the ``src`` of its ``img`` and the text of its ``p``."""

    # ``src`` of the ``img`` element.
    img = Attr('img', 'src')
    # Text of the ``p`` element.
    step = Text('p')
class Selected(Item):
    """Selected entry: ``.name`` text and href, plus the ``a > img`` source."""

    # Text of the ``.name`` element.
    title = Text('.name')
    # ``src`` of the image nested directly under an anchor.
    img = Attr('a > img', 'src')
    # ``href`` of the same ``.name`` element that supplies the title.
    url = Attr('.name', 'href')