Example 1
def parse(self, response):
    sel = pyQuery(response.body)
    sites = sel('#tbl_proxy_list > tbody:eq(0) > tr')
    for row in sites:
        dom = pyQuery(row)
        # Skip rows whose cells wrap a <div> (banner rows, not proxy data)
        if dom('td center div'):
            continue
        # Build a fresh item per row so each yielded proxy is independent
        proxy = ProxyItem()
        ip = dom('td:eq(0) span').text()
        proxy["ip"] = ip.strip() if ip else ""
        port = dom('td:eq(1)').text()
        proxy["port"] = port.strip() if port else ""
        yield proxy
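
PyQuery accepts jQuery-style pseudo-selectors such as the :eq() used above. A minimal, self-contained sketch of the same selection logic outside Scrapy, with hypothetical sample markup standing in for the real table:

from pyquery import PyQuery as pyQuery

# Hypothetical markup mirroring the structure the spider scrapes
html = '''<table id="tbl_proxy_list"><tbody>
<tr><td><span>1.2.3.4</span></td><td>8080</td></tr>
</tbody></table>'''

sel = pyQuery(html)
for row in sel('#tbl_proxy_list > tbody:eq(0) > tr'):
    dom = pyQuery(row)
    print(dom('td:eq(0) span').text(), dom('td:eq(1)').text())
# -> 1.2.3.4 8080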
Example 2
def analyze_info(self, url):
    """
    Parse the listing data.
    :param url: page URL
    """
    house_list = []
    doc = pyQuery(get_html(url, self.referer))
    items = doc('.key-list .item-mod').items()
    for item in items:
        address = item.find('.address').text()
        # Find the non-breaking space, then collapse all whitespace
        index = address.find('\xa0', 2)
        address = ' '.join(address.split())
        # District
        city = ''
        if index >= 2:
            city = address[2:index]
        # Price
        price_desc = item.find('.price').text() or item.find(
            '.price-txt').text()
        house_info = {
            # City
            'city': city,
            # Name
            # 'name': item.find('.lp-name h3').text(),
            'name': item.find('.items-name').text(),
            # Floor plan
            'house_type': ' '.join(item.find('.huxing').text().split()),
            # Address
            'address': address,
            # Address link
            'address_link': item.find('.address').attr('href'),
            # Tags
            'tags': item.find('.tag-panel').text(),
            # Price
            'price': price_desc,
            'price_nu': analysis_price(price_desc),
            # Rank
            'rank': item.find('.group-mark').text(),
            # Image
            'pic': item.find('.pic img').attr('src'),
            # Image link
            'pic_link': item.children('.pic').attr('href'),
            'report_date': self.report_date
        }
        # Append to the list
        house_list.append(house_info)
    self.total += len(house_list)
    # Bulk-insert this page's data into MongoDB (skip empty pages)
    if house_list:
        self.collection.insert(house_list)
    # Fetch the next page and keep crawling if one exists
    next_url = doc('.list-page .next-page').attr('href')
    if next_url:
        # Use the previous URL as the Referer
        self.referer = url
        time.sleep(2)
        self.new_log.logger.info('next => %s' % next_url)
        self.analyze_info(next_url)
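
Both this example and the next rely on an analysis_price helper that is not shown. A minimal sketch, assuming it simply extracts the first number from a price string such as "12000元/平":

import re

def analysis_price(price_desc):
    # Hypothetical implementation: pull the first number out of a price
    # string; returns 0 when no digits are present
    match = re.search(r'\d+(?:\.\d+)?', price_desc or '')
    return float(match.group()) if match else 0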
Example 3
def analyze_info(self, url):
    """
    Parse the listing data.
    :param url: page URL
    """
    house_list = []
    doc = pyQuery(get_html(url, self.referer))
    items = doc('#houselist-mod-new .list-item').items()
    for item in items:
        detail = ' '.join(
            item.find('.details-item').text().split()).split(' ')
        if len(detail) < 3:
            continue
        all_price_desc = item.find('.price-det').text()
        unit_price_desc = item.find('.unit-price').text()
        house_info = {
            # District
            'city': detail[2].split('-')[0],
            # Name
            'name': detail[1],
            # Floor plan (slice up to and including the character '造')
            'house_type': detail[0][0:detail[0].find('造') + 1],
            # Address
            'address': detail[2],
            # Tags
            'tags': item.find('.tags-bottom').text(),
            # Total price
            'all_price': all_price_desc,
            'all_price_nu': analysis_price(all_price_desc),
            # Unit price
            'unit-price': unit_price_desc,
            'unit-price_nu': analysis_price(unit_price_desc),
            # Image
            'pic': item.find('.item-img img').attr('src'),
            # Listing authenticity
            'authenticity': item.find('.house-title .house-icon').text(),
            'report_date': self.report_date
        }
        # Append to the list
        house_list.append(house_info)
    self.total += len(house_list)
    # Bulk-insert this page's data into MongoDB (skip empty pages)
    if house_list:
        self.collection.insert(house_list)
    # Fetch the next page and keep crawling if one exists
    next_url = doc('.multi-page .aNxt').attr('href')
    if next_url:
        # Use the previous URL as the Referer
        self.referer = url
        time.sleep(2)
        self.sale_log.logger.info('next => %s' % next_url)
        self.analyze_info(next_url)
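
Because analyze_info calls itself once per page, a very long listing can hit Python's default recursion limit (about 1000 frames). A hedged sketch of the same pagination flow rewritten iteratively:

def analyze_info(self, url):
    # Sketch: identical pagination flow, but a while loop replaces the
    # recursive call, so page count no longer limits depth
    while url:
        doc = pyQuery(get_html(url, self.referer))
        # ... parse and store items exactly as above ...
        next_url = doc('.multi-page .aNxt').attr('href')
        if next_url:
            self.referer = url
            time.sleep(2)
        url = next_url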
Example 4
def crawl_daili66(self, page_count=4):
    """
    Fetch proxies from daili66 (www.66ip.cn).
    :param page_count: number of pages to crawl
    :return: proxies as "ip:port" strings
    """
    start_url = 'http://www.66ip.cn/{}.html'
    urls = [start_url.format(page) for page in range(1, page_count + 1)]
    for url in urls:
        print('Crawling', url)
        html = get_page(url)
        if html:
            doc = pyQuery(html)
            # :gt(0) skips the table's header row
            trs = doc('.containerbox table tr:gt(0)').items()
            for tr in trs:
                ip = tr.find('td:nth-child(1)').text()
                port = tr.find('td:nth-child(2)').text()
                yield ':'.join([ip, port])
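
The get_page helper is not shown here. A minimal sketch of what it presumably does, using requests (an assumption; the original may use another HTTP client):

import requests

def get_page(url):
    # Fetch a page and return its HTML, or None on failure, so callers
    # can use a simple `if html:` check as above
    try:
        resp = requests.get(url, timeout=10)
        if resp.status_code == 200:
            return resp.text
    except requests.RequestException:
        pass
    return None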
Example 5
import urllib2  ## Python 2 HTTP client
from pyquery import PyQuery as pyQuery

## Work-order processing address
operateAddr = 'http://omssd.sky-mobi.com:9040/oms-workflow/workflow/rundeploy/busiOpt!sendRetJSONStr.action'

## `addr` and `cookie` are defined earlier in the original script (not shown)
reqHeader = {'Cookie': cookie}
req = urllib2.Request(addr, headers=reqHeader)

print('Reading task list, waiting for the response; this takes about 10 seconds...')
result = urllib2.urlopen(req).read()
## print(result)
print('Task list fetched, parsing...')

###########################################################################################################

taskList = []
## Unlike jQuery, pyQuery must first be instantiated against the target
## document (it is not embedded in the page, after all)
doc = pyQuery(result)
## items() returns a generator, well suited to processing list-like elements
for item in doc('.datalist').items():
    ele = item.find('td')
    firField = ele.eq(0).find('input')
    task = {
        'taskId': firField.eq(3).attr('value'),
        'workorderDefinitionKey': firField.eq(2).attr('value'),
        'orderId': ele.eq(1).text(),  ## sequence number
        'invoiceId': ele.eq(2).text(),  ## work-order number
        'invoiceName': ele.eq(3).text(),  ## work-order name
        'customer': ele.eq(4).text(),  ## submitter
        'project': ele.eq(5).text(),  ## project name
        'createDate': ele.eq(6).text(),  ## creation time
        'status': ele.eq(7).text(),  ## work-order status
        'detail': ele.eq(8).text()  ## detail description
    }
    taskList.append(task)
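
As the comment above notes, items() yields PyQuery-wrapped elements; iterating a selection directly yields raw lxml elements that lack the jQuery-style API. The difference in a few lines:

from pyquery import PyQuery as pyQuery

doc = pyQuery('<ul><li>a</li><li>b</li></ul>')
for li in doc('li'):
    print(type(li))      # lxml element: no jQuery-style .text() method
for li in doc('li').items():
    print(li.text())     # PyQuery objects: prints a, then b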
Example 6
from pyquery import PyQuery as pyQuery

p = pyQuery('<p id="hello" class="hello"></p>')('p')
p.addClass("todo")
print(repr(p))
p.toggleClass("titi toto")
print(repr(p))
p.removeClass("titi")
print(repr(p))

p.css("front-size", "15px")
print(repr(p))
print(p.attr('style'))
p.css({"front-size": "17px"})
print(repr(p))
print(p.attr('style'))

# same the python way
p.css.front_size = "15px"
print(p.attr.style)
p.css["front-size"] = "15px"
print(p.attr.style)
p.css(front_size="16px")
p.css = {"front-size": "15px"}
print(p.attr.style)
Example 7
import requests
import pyquery

def get_article(url):
    # Fetch the page and return the text of its #js_content container
    r = requests.get(url)
    document = pyquery.PyQuery(r.text)
    return document('#js_content').text()
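
When a page mis-declares its charset (common on Chinese-language sites), requests can decode it wrongly. A hedged variant that forces the detected encoding before parsing:

import requests
import pyquery

def get_article(url):
    r = requests.get(url)
    # Assumption: fall back to requests' charset detection when the
    # declared encoding is wrong or missing
    r.encoding = r.apparent_encoding
    return pyquery.PyQuery(r.text)('#js_content').text()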
Example 8
# -*- encoding: utf-8 -*-
from pyquery import PyQuery as pyQuery

p = pyQuery('<p id="hello" class="hello"></p>')('p')
print(p.attr('id'))
print(p.attr("id", "plop"))
print(p.attr("id", "hello"))

# the more Pythonic way
p.attr.id = "plop"
print(p.attr.id)
p.attr['id'] = "ola"
print(p.attr['id'])
p.attr(id="hello", class_="hello2")
print(repr(p))
p.attr.class_ = "hello"
print(repr(p))
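
Because class is a reserved word in Python, PyQuery accepts the trailing-underscore spelling (class_) and maps it to the real attribute name. A quick check on the same p element:

p.attr.class_ = "hello2"
print(p.attr('class'))   # 'hello2'
p.attr.class_ = "hello"  # restore the original class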

item = pyQuery('<li class="ls-disc-li jpriced" levenum="56" fsid="1591295333" fpid="1729504036">'+
               '<div class="img"> '+
               '<a target="_blank" href="//item.jd.com/1591295333.html"><img title="鸿星尔克男鞋夏秋防滑减震耐磨休闲鞋系带低帮MD大底运动鞋男 孔雀蓝/正黄. 41" alt="鸿星尔克男鞋夏秋防滑减震耐磨休闲鞋系带低帮MD大底运动鞋男 孔雀蓝/正黄. 41" src="//img14.360buyimg.com/n7/jfs/t1342/63/1413028126/240422/f6459617/55c45c99Nfb7344a0.jpg" data-lazyload="done"></a>'+
               '</div>  '+
               '<h3 class="title"><a target="_blank" href="//item.jd.com/1591295333.html" title="鸿星尔克男鞋夏秋防滑减震耐磨休闲鞋系带低帮MD大底运动鞋男 孔雀蓝/正黄. 41">鸿星尔克男鞋夏秋防滑减震耐磨休闲鞋系带低帮MD大底运动鞋男 孔雀蓝/正黄. 41</a></h3>'+
               '<p class="price-p">¥<strong>279</strong><del>京东价:<del>¥279</del></del></p>   '+
               '<p class="rank">   '+
               '<em>专享会员:</em>        '+
               '<i class="rank18 rank18-2" title="铜牌会员"></i>  '+
               '<i class="rank18 rank18-3" title="银牌会员"></i>  '+
               '<i class="rank18 rank18-4" title="金牌会员"></i>  '+
               '<i class="rank18 rank18-5" title="钻石会员"></i> '+
               '</p> '+