def parse(self, response):
    """Parse the proxy-list table page and yield one ProxyItem per data row.

    :param response: downloader response whose body contains the HTML table
    :return: generator of ProxyItem with "ip" and "port" fields
    """
    sel = pyQuery(response.body)
    sites = sel('#tbl_proxy_list > tbody:eq(0) > tr')
    for row in sites:
        dom = pyQuery(row)
        # Rows containing a centered div are separator/ad rows, not proxies.
        if dom('td center div'):
            continue
        # BUG FIX: build a fresh item for every row. The original created
        # one ProxyItem before the loop and mutated it each iteration, so
        # every yielded item aliased the same underlying object and later
        # rows overwrote earlier ones downstream.
        proxy = ProxyItem()
        ip = dom('td:eq(0) span').text()
        proxy["ip"] = ip.strip() if ip else ""
        port = dom('td:eq(1)').text()
        proxy["port"] = port.strip() if port else ""
        yield proxy
def analyze_info(self, url):
    """Scrape new-house listing pages and batch-insert them into MongoDB.

    Walks the pagination iteratively via the "next page" link. (The
    original implementation recursed once per page, which can exhaust the
    recursion limit on deep pagination.)

    :param url: URL of the first listing page to scrape
    """
    while url:
        house_list = []
        doc = pyQuery(get_html(url, self.referer))
        items = doc('.key-list .item-mod').items()
        for item in items:
            address = item.find('.address').text()
            # Locate the nbsp separator that precedes the city fragment.
            # NOTE(review): the index is computed on the raw string but
            # applied to the whitespace-collapsed one below — preserved
            # as-is; confirm this is the intended behavior.
            index = address.find('\xa0', 2)
            # Collapse whitespace runs to single spaces.
            address = ' '.join(address.split())
            # City fragment between position 2 and the separator.
            city = ''
            if index >= 2:
                city = address[2:index]
            # The price lives in .price or, on some cards, .price-txt.
            price_desc = item.find('.price').text() or item.find(
                '.price-txt').text()
            house_info = {
                # city
                'city': city,
                # name
                'name': item.find('.items-name').text(),
                # floor plan / house type
                'house_type': ' '.join(item.find('.huxing').text().split()),
                # address
                'address': address,
                # address link
                'address_link': item.find('.address').attr('href'),
                # tags
                'tags': item.find('.tag-panel').text(),
                # price: raw text plus parsed numeric value
                'price': price_desc,
                'price_nu': analysis_price(price_desc),
                # rank
                'rank': item.find('.group-mark').text(),
                # picture and its link
                'pic': item.find('.pic img').attr('src'),
                'pic_link': item.children('.pic').attr('href'),
                'report_date': self.report_date
            }
            house_list.append(house_info)
        self.total += len(house_list)
        # BUG FIX: inserting an empty list raises in pymongo; guard it so
        # a page with no matching cards does not abort the crawl.
        # NOTE(review): Collection.insert is deprecated/removed in newer
        # pymongo — consider insert_many; kept for compatibility here.
        if house_list:
            self.collection.insert(house_list)
        # Follow the next page, if any.
        next_url = doc('.list-page .next-page').attr('href')
        if not next_url:
            break
        self.referer = url  # current page becomes the Referer header
        time.sleep(2)  # throttle requests
        self.new_log.logger.info('next => %s' % next_url)
        url = next_url
def analyze_info(self, url):
    """Scrape resale-house listing pages and batch-insert them into MongoDB.

    Walks the pagination iteratively via the "next page" link. (The
    original implementation recursed once per page, which can exhaust the
    recursion limit on deep pagination.)

    :param url: URL of the first listing page to scrape
    """
    while url:
        house_list = []
        doc = pyQuery(get_html(url, self.referer))
        items = doc('#houselist-mod-new .list-item').items()
        for item in items:
            # Details text collapses to "type name address ..." tokens.
            detail = ' '.join(
                item.find('.details-item').text().split()).split(' ')
            # Skip malformed cards missing type/name/address.
            if len(detail) < 3:
                continue
            all_price_desc = item.find('.price-det').text()
            unit_price_desc = item.find('.unit-price').text()
            house_info = {
                # district (prefix of the address before '-')
                'city': detail[2].split('-')[0],
                # name
                'name': detail[1],
                # house type: keep up to and including the '造' marker
                'house_type': detail[0][0:detail[0].find('造') + 1],
                # address
                'address': detail[2],
                # tags
                'tags': item.find('.tags-bottom').text(),
                # total price: raw text plus parsed numeric value
                'all_price': all_price_desc,
                'all_price_nu': analysis_price(all_price_desc),
                # unit price: raw text plus parsed numeric value
                'unit-price': unit_price_desc,
                'unit-price_nu': analysis_price(unit_price_desc),
                # picture
                'pic': item.find('.item-img img').attr('src'),
                # listing authenticity badge
                'authenticity': item.find('.house-title .house-icon').text(),
                'report_date': self.report_date
            }
            house_list.append(house_info)
        self.total += len(house_list)
        # BUG FIX: inserting an empty list raises in pymongo; guard it so
        # a page with no matching cards does not abort the crawl.
        # NOTE(review): Collection.insert is deprecated/removed in newer
        # pymongo — consider insert_many; kept for compatibility here.
        if house_list:
            self.collection.insert(house_list)
        # Follow the next page, if any.
        next_url = doc('.multi-page .aNxt').attr('href')
        if not next_url:
            break
        self.referer = url  # current page becomes the Referer header
        time.sleep(2)  # throttle requests
        self.sale_log.logger.info('next => %s' % next_url)
        url = next_url
def crawl_daili66(self, page_count=4):
    """Crawl www.66ip.cn and yield proxies formatted as "ip:port".

    :param page_count: number of paginated pages to fetch
    :return: generator of proxy strings
    """
    base = 'http://www.66ip.cn/{}.html'
    for page in range(1, page_count + 1):
        url = base.format(page)
        print('Crawling', url)
        html = get_page(url)
        if not html:
            continue
        doc = pyQuery(html)
        # :gt(0) skips the table header row.
        for tr in doc('.containerbox table tr:gt(0)').items():
            ip = tr.find('td:nth-child(1)').text()
            port = tr.find('td:nth-child(2)').text()
            yield ':'.join([ip, port])
## 工单处理地址 operateAddr = 'http://omssd.sky-mobi.com:9040/oms-workflow/workflow/rundeploy/busiOpt!sendRetJSONStr.action' reqHeader = {'Cookie': cookie} req = urllib2.Request(addr, headers=reqHeader) print('正在读取任务列表, 等待响应, 大约需要10秒...') result = urllib2.urlopen(req).read() ## print(result) print('任务列表获取成功, 正在解析...') ########################################################################################################### taskList = [] ## 与jQ不同的是, pyQuery至少需要先实例化得到目标对象才行(毕竟不是内嵌的代码) pyQuery = pyQuery(result) ## items()返回一个生成器, 适合用来处理list类型的元素 for item in pyQuery('.datalist').items(): ele = item.find('td') firField = ele.eq(0).find('input') task = { 'taskId': firField.eq(3).attr('value'), 'workorderDefinitionKey': firField.eq(2).attr('value'), 'orderId': ele.eq(1).text(), ## 序号 'invoiceId': ele.eq(2).text(), ## 工单编号 'invoiceName': ele.eq(3).text(), ## 工单名称 'customer': ele.eq(4).text(), ## 提交人 'project': ele.eq(5).text(), ## 项目名称 'createDate': ele.eq(6).text(), ## 创建时间 'status': ele.eq(7).text(), ## 工单状态 'detail': ele.eq(8).text() ## 详情描述
from pyquery import PyQuery as pyQuery

# Demo: CSS class and inline-style manipulation with pyquery.
p = pyQuery('<p id="hello" class="hello"></p>')('p')
p.addClass("todo")
print(repr(p))
p.toggleClass("titi toto")
print(repr(p))
p.removeClass("titi")
print(repr(p))
# BUG FIX: the CSS property is "font-size" — the original wrote
# "front-size" throughout, setting a nonexistent property.
p.css("font-size", "15px")
print(repr(p))
print(p.attr('style'))
p.css({"font-size": "17px"})
print(repr(p))
print(p.attr('style'))
# same the python way (underscores map to hyphens)
p.css.font_size = "15px"
print(p.attr.style)
p.css["font-size"] = "15px"
print(p.attr.style)
p.css(font_size="16px")
p.css = {"font-size": "15px"}
print(p.attr.style)
def get_article(url):
    """Fetch a web article and return the plain text of its body.

    :param url: article URL to download
    :return: text content of the '#js_content' element
    """
    r = requests.get(url)
    # BUG FIX: the pyquery module exports the class as ``PyQuery``;
    # ``pyquery.pyQuery`` raises AttributeError under ``import pyquery``.
    document = pyquery.PyQuery(r.text)
    return document('#js_content').text()
# -*- encoding: utf-8 -*- from pyquery import PyQuery as pyQuery p = pyQuery('<p id="hello" class="hello"></p>')('p') print(p.attr('id')) print(p.attr("id", "plop")) print(p.attr("id", "hello")) # more python way p.attr.id ="plop" print(p.attr.id) p.attr['id']="ola" print(p.attr['id']) p.attr(id="hello",class_="hello2") print(repr(p)) p.attr.class_ ="hello" print(repr(p)) item = pyQuery('<li class="ls-disc-li jpriced" levenum="56" fsid="1591295333" fpid="1729504036">'+ '<div class="img"> '+ '<a target="_blank" href="//item.jd.com/1591295333.html"><img title="鸿星尔克男鞋夏秋防滑减震耐磨休闲鞋系带低帮MD大底运动鞋男 孔雀蓝/正黄. 41" alt="鸿星尔克男鞋夏秋防滑减震耐磨休闲鞋系带低帮MD大底运动鞋男 孔雀蓝/正黄. 41" src="//img14.360buyimg.com/n7/jfs/t1342/63/1413028126/240422/f6459617/55c45c99Nfb7344a0.jpg" data-lazyload="done"></a>'+ '</div> '+ '<h3 class="title"><a target="_blank" href="//item.jd.com/1591295333.html" title="鸿星尔克男鞋夏秋防滑减震耐磨休闲鞋系带低帮MD大底运动鞋男 孔雀蓝/正黄. 41">鸿星尔克男鞋夏秋防滑减震耐磨休闲鞋系带低帮MD大底运动鞋男 孔雀蓝/正黄. 41</a></h3>'+ '<p class="price-p">¥<strong>279</strong><del>京东价:<del>¥279</del></del></p> '+ '<p class="rank"> '+ '<em>专享会员:</em> '+ '<i class="rank18 rank18-2" title="铜牌会员"></i> '+ '<i class="rank18 rank18-3" title="银牌会员"></i> '+ '<i class="rank18 rank18-4" title="金牌会员"></i> '+ '<i class="rank18 rank18-5" title="钻石会员"></i> '+ '</p> '+