class Cookbook(Item):
    """Field declarations for a single cookbook (recipe) page.

    Each attribute binds a CSS selector to a field name: ``Text`` fields
    take the matched element's text, ``Attr`` fields take the named
    attribute of the matched element.
    """

    title = Text("h2.title")
    # Cover image: the ``src`` of the <img> nested under .recipe-content.
    img = Attr(
        ".recipe-content > div > div > div > a > img",
        "src",
    )
    browse_count = Text("div.vcnum > span")
    collect_count = Text("div.vcnum > span.collectnum")
    intro = Text("p.intro")
    tip = Text("div.tips > p")
class Repic(Item):
    """Field declarations for a recipe card.

    ``img`` is read from the inline ``style`` attribute of ``a.cook-img``
    and reduced to the bare image URL by :meth:`clean_img`.
    """

    img = Attr('a.cook-img', 'style')
    url = Attr('a.cook-img', 'href')
    title = Text('div.cook-info > a.cookname')
    major = Text('div.cook-info > p.major')

    def clean_img(self, img):
        """Extract the URL from a ``background: url(...)`` inline style.

        Args:
            img: Raw value of the ``style`` attribute.

        Returns:
            The text between ``url(`` and ``)`` when *img* has the expected
            layout; ``None`` when *img* is empty/``None`` or does not match.
        """
        # A missing element yields no style string at all; nothing to parse.
        if not img:
            return None
        found = re.match(
            'background: url[(](.*)[)] no-repeat center center;'
            'background-size: cover;position: relative;',
            img,
        )
        # re.match returns None on mismatch; calling .groups() on it would
        # raise AttributeError, so report "no image" instead.
        if found is None:
            return None
        return found.group(1)
class CourseSeats(Item):
    """Class number and seat counts scraped from one course row.

    ``total_seats`` and ``open_seats`` share the same selector; the cell
    text is split on whitespace and each field picks its own token
    (presumably a layout like ``"<open> of <total>"`` -- confirm against
    the page).
    """

    class_num = Text(".classNbrColumnValue > .course-details-link")
    total_seats = Text(".availableSeatsColumnValue")
    open_seats = total_seats

    def clean_class_num(self, value):
        """Return the class number with surrounding whitespace removed."""
        return value.strip()

    # NOTE: the seat hooks split their own ``value`` on every call. An
    # earlier version cached the first split on ``self`` and never reset
    # it, so later calls with different text returned stale counts.

    def clean_total_seats(self, value):
        """Return the third whitespace-separated token of *value* as int."""
        return int(value.split()[2])

    def clean_open_seats(self, value):
        """Return the first whitespace-separated token of *value* as int."""
        return int(value.split()[0])
class ClassSeats(Item):
    """Seat counts scraped from the class details side panel.

    Both fields read ``#details-side-panel > span``; the text is split on
    whitespace and each field picks its own token by position.
    """

    total_seats = Text("#details-side-panel > span")
    open_seats = total_seats

    # NOTE: each hook splits its own ``value``. An earlier version cached
    # the first split on ``self`` and never reset it, so later calls with
    # different text returned stale counts.

    def clean_total_seats(self, value):
        """Return the fifth whitespace-separated token of *value* as int."""
        return int(value.split()[4])

    def clean_open_seats(self, value):
        """Return the third whitespace-separated token of *value* as int."""
        return int(value.split()[2])
class Class(Item):
    """Department, course code and title parsed from the page's ``<h2>``.

    All three fields read the same heading; the text is split on
    whitespace and each field takes its own slice. Token 2 is skipped by
    every field (presumably a separator such as ``-`` -- confirm against
    the page).
    """

    department = Text("h2")
    course = department
    title = department
    # school = Text(".row > .col-md-7 > span > a")
    # instructor = Attr(".nametip", "title")

    # NOTE: each hook splits its own ``value``. An earlier version cached
    # the first split on ``self`` and never reset it, so later calls with
    # a different heading returned stale data; it also printed the cached
    # tokens to stdout as a debugging aid.

    def clean_department(self, value):
        """Return the first whitespace-separated token of the heading."""
        return value.split()[0]

    def clean_course(self, value):
        """Return the second whitespace-separated token of the heading."""
        return value.split()[1]

    def clean_title(self, value):
        """Return tokens from index 3 onward, re-joined with single spaces."""
        return ' '.join(value.split()[3:])
import requests
from htmlparsing import Element, HTMLParsing, Text, Attr, Parse

# Fetch one page and extract a single record (title, points, link) from it.
url = 'http://localhost:8082/home/serveList.html'
# Without a timeout, requests waits indefinitely on an unresponsive server.
r = requests.get(url, timeout=10)
article_detail = HTMLParsing(r.text).detail({
    'title': Text('a.storylink'),
    'points': Parse('span.score', '>{} points'),
    'link': Attr('a.storylink', 'href'),
})
print(article_detail)
class Post(Item):
    """Field declarations for a post: its link target and its heading text."""

    # ``href`` of the element matching ``.read-more``.
    url = Attr(".read-more", "href")
    # Text of the anchor directly inside an <h1>.
    title = Text("h1 > a")
class Post(Item):
    """Field declarations for a post whose ``url`` names an odd attribute.

    NOTE(review): ``'no this attribute'`` looks like a deliberate
    placeholder for an attribute that does not exist on the element
    (e.g. to exercise missing-attribute handling) -- confirm.
    """

    url = Attr(".storylink", "no this attribute")
    title = Text(".storylink")
class Post(Item):
    """Field declarations for a post, both read from ``.storylink``."""

    # Link target: the ``href`` attribute of the matched element.
    url = Attr(".storylink", "href")
    # Headline: the text of the same element.
    title = Text(".storylink")
class Post(Item):
    """Post fields taken from the ``.storylink`` element.

    ``url`` is the element's ``href`` attribute; ``title`` is its text.
    """

    url = Attr('.storylink', 'href')
    title = Text('.storylink')
class Post(Item):
    """Post fields taken from ``.storylink``, with an unusual ``url`` attribute.

    NOTE(review): the attribute name ``"no this attribute"`` is presumably
    an intentionally absent attribute used to exercise the missing-attribute
    path -- confirm before changing it.
    """

    url = Attr('.storylink', 'no this attribute')
    title = Text('.storylink')
class Course(Item):
    """Field declarations for a course entry: link target and heading."""

    # ``href`` of the first matching anchor.
    url = Attr("a", "href")
    # Text of the <h4> heading.
    title = Text("h4")
class Ingredients(Item):
    """Field declarations for one ingredient entry: name and quantity."""

    # Text of a <span>; the selector is not restricted by class.
    ingredient = Text("span")
    # Text of the <span> carrying the ``right`` class.
    weight = Text("span.right")
class Step(Item):
    """Field declarations for one step: its image and its paragraph text."""

    # ``src`` of the step's <img>.
    img = Attr("img", "src")
    # Text of the step's <p>.
    step = Text("p")
class Selected(Item):
    """Field declarations for a selected entry: name, image and link.

    ``title`` and ``url`` both come from the ``.name`` element (its text
    and its ``href``); ``img`` is the ``src`` of an <img> directly inside
    an anchor.
    """

    title = Text(".name")
    img = Attr("a > img", "src")
    url = Attr(".name", "href")
"""<Result ('Skip to content',) {}>""" # Get content or html print(e.xpath('//a')[5].text) """PyPI""" print(e.xpath('//a')[5].html) """<a href="https://pypi.python.org/" title="Python Package Index">PyPI</a>""" print(e.xpath('//a')[5].markdown) """[PyPI](https://pypi.python.org/ "Python Package Index")""" url = 'https://news.ycombinator.com/' r = requests.get(url) article_list = HTMLParsing(r.text).list('.athing', { 'title': Text('a.storylink'), 'link': Attr('a.storylink', 'href') }) print(article_list) url = 'https://news.ycombinator.com/item?id=16476454' r = requests.get(url) article_detail = HTMLParsing(r.text).detail({ 'title': Text('a.storylink'), 'points': Parse('span.score', '>{} points'), 'link': Attr('a.storylink', 'href') }) print(article_detail)