Example no. 1
 def getContent(self, url):
     browser = webdriver.PhantomJS(executable_path=self.phantomjsPath)
     browser.get(url)
     time.sleep(self.waittime)
     html = browser.execute_script(
         "return document.documentElement.outerHTML")
     doc = etree.HTML(html)
     jdlistExtra = GsExtractor()
     jdlistExtra.setXsltFromFile("jd_list.xml")
     output = jdlistExtra.extract(doc)
     return output
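
For orientation, GsExtractor (from the gooseeker module used throughout these examples) is configured with an xslt rule, either from a local file (setXsltFromFile) or from the GooSeeker API (setXsltFromAPI), and is then applied to a parsed document with extract. Below is a minimal sketch of an extractor in the same spirit, assuming it is essentially a thin wrapper around lxml's XSLT support (an illustration, not the library's actual source):

from lxml import etree


class SimpleXsltExtractor:
    def __init__(self):
        self._xslt = None

    def setXsltFromFile(self, xslt_file_path):
        # load the XSLT rule from a local file
        with open(xslt_file_path, 'r', encoding='UTF-8') as f:
            self._xslt = f.read()

    def extract(self, html_doc):
        # apply the XSLT transform to a parsed lxml document
        if not self._xslt:
            return None
        transform = etree.XSLT(etree.fromstring(self._xslt.encode('UTF-8')))
        return transform(html_doc)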
Example no. 2
class webCon:
    # Chrome is slow to start, so launch it as rarely as possible; a class-level (static) attribute works best
    chrome = chromes()  # instantiated only once
    mylog = Mylog("WebCon")

    def __init__(self):
        # API provided by GooSeeker (Jisouke)
        self.extra = GsExtractor()

        # service provided by the IP proxy vendor
        self.Ipagency = IpAgency()

    def con_ThenGetContent(self, url, rule):
        try:
            # collect data via the Chrome browser
            data = webCon.chrome.get_html(url=url)
            doc = etree.HTML(data)
            self.extra.setXsltFromAPI(theme=rule)  # fetch and load the rule file
            content = self.extra.extract(doc)  # content parsed according to the xslt, as XML bytes
            return content
        except Exception as e:
            # best to log this
            webCon.mylog.debug("chrome has a problem:" + str(e))
            rd = RandomUserAgent()
            headers = {"User-Agent": rd.get_RanDomAgent()}
            try:
                # collect data via requests with a proxy
                data = requests.get(url=url,
                                    headers=headers,
                                    timeout=500,
                                    proxies=self.Ipagency.getIpProxy()).text
                # data = requests.get(url=url, headers=headers,timeout=500).text  # without a proxy
                doc = etree.HTML(data)
                # doc = etree.HTML(conn.read())
                self.extra.setXsltFromAPI(theme=rule)  # fetch and load the rule file
                content = self.extra.extract(doc)  # content parsed according to the xslt, as XML bytes
                return content
            except Exception as e1:
                # best to log this as well
                print("requests with proxy has a problem:" + str(e1))
                try:
                    # if the Abuyun proxy IP fails, collect with requests from the local IP
                    data = requests.get(url=url, headers=headers,
                                        timeout=500).text  # without a proxy
                    doc = etree.HTML(data)
                    self.extra.setXsltFromAPI(theme=rule)  # fetch and load the rule file
                    content = self.extra.extract(doc)  # content parsed according to the xslt, as XML bytes
                    return content
                except Exception as e2:
                    # log this, warning level
                    webCon.mylog.error(str.encode((str(url) + ":" + str(e2))))
                    print(str.encode((str(url) + ":" + str(e2))))
                    return None

    def getextra(self):
        return self.extra
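
A hypothetical way to drive the class above (the rule name and output file below are assumptions for illustration; chromes, Mylog, IpAgency and RandomUserAgent come from the author's own project and are not shown here):

con = webCon()
# tries Chrome first, then requests through a proxy, then requests without a proxy
result = con.con_ThenGetContent("http://www.gooseeker.com/cn/forum/7", rule="gooseeker_bbs_xslt")
if result is not None:
    # the comments above describe the return value as XML in bytes form
    open("result.xml", "wb").write(result)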
Example no. 3
 def parse(self, response):
     print("----------------------------------------------------------------------------")
     # instantiate the extractor
     bbsExtra = GsExtractor()
     # set the xslt extraction rule
     bbsExtra.setXsltFromAPI("31d24931e043e2d5364d03b8ff9cc77e", "安居客_房源")
     # call the extract method to pull out the desired content
     result = bbsExtra.extractHTML(response.body)
     # print the extraction result
     print(str(result).encode('gbk','ignore').decode('gbk'))
     # save the extraction result
     file_path = os.getcwd() + "/anjuke-result.xml"
     open(file_path,"wb").write(result)
     # print the path of the result file
     print("Result file: " + file_path)
Example no. 4
 def parse(self, response):
     print("-" * 10)
     # instantiate the latest GooSeeker rule extractor
     bbsExtra = GsExtractor()
     # set the xslt extraction rule
     bbsExtra.setXsltFromAPI("31d24931e043e2d5364d03b8ff9cc77e", "安居客_房源")
     # call the extract method to pull out the desired content
     result = bbsExtra.extractHTML(response.body)
     # print the extraction result
     res = str(result).encode('gbk', 'ignore').decode('gbk')
     print(res)
     # save the extraction result
     file_path = os.getcwd() + "/anjuke-result.xml"
     open(file_path, "wb").write(result)
     # print the path of the result file
     print("Result file: " + file_path)
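
The two parse() methods above are Scrapy callbacks shown without their spider class. For context, a minimal, hypothetical spider skeleton they could be pasted into (the spider name and start URL are assumptions, not part of the original examples):

import os
import scrapy
from gooseeker import GsExtractor


class AnjukeSpider(scrapy.Spider):
    name = "anjuke"
    start_urls = ["http://shenzhen.anjuke.com/tycoon/nanshan/"]

    def parse(self, response):
        # the body of the parse() method shown above goes here
        pass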
Example no. 5
 def parse(self, response):
     print("start...")
     # start browser
     self.browser.get(response.url)
     # loading time interval
     time.sleep(3)
     # get xslt
     extra = GsExtractor()
     extra.setXsltFromAPI("API KEY", "淘宝天猫_商品详情30474")
     # get doc
     html = self.browser.execute_script("return document.documentElement.outerHTML")
     doc = etree.HTML(html)
     result = extra.extract(doc)
     # out file
     file_name = 'E:/淘宝天猫_商品详情30474_' + self.getTime() + '.xml'
     open(file_name, "wb").write(result)
     self.browser.close()
     print("end")
Example no. 6
 def parse(self, response):
     print("start...")
     # start browser
     self.browser.get(response.url)
     # loading time interval
     time.sleep(3)
     # get xslt
     extra = GsExtractor()
     extra.setXsltFromAPI("0a3898683f265e7b28991e0615228baa", "淘宝天猫_商品详情30474")
     # get doc
     html = self.browser.execute_script("return document.documentElement.outerHTML")
     doc = etree.HTML(html)
     result = extra.extract(doc)
     
     # out file
     file_name = 'F:/temp/淘宝天猫_商品详情30474_' + self.getTime() + '.xml'
     open(file_name, "wb").write(result)
     self.browser.close()
     print("end")
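
Examples 5 and 6 assume the spider already owns a Selenium browser in self.browser and a getTime() helper, neither of which is shown. One possible way to provide them (the class name, driver choice and timestamp format are assumptions for illustration):

import time
from selenium import webdriver


class TmallDetailSpiderBase:
    def __init__(self):
        # any Selenium driver can back self.browser; Firefox is only an example
        self.browser = webdriver.Firefox()

    def getTime(self):
        # timestamp used to build the output file name
        return time.strftime("%Y%m%d_%H%M%S", time.localtime())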
Example no. 7
from urllib import request
from lxml import etree
from gooseeker import GsExtractor


class Spider:
    def getContent(self, url):
        conn = request.urlopen(url)
        output = etree.HTML(conn.read())
        return output

    def saveContent(self, filepath, content):
        file_obj = open(filepath, 'w', encoding='UTF-8')
        file_obj.write(content)
        file_obj.close()


bbsExtra = GsExtractor()
# the following call uses gooseeker's API to set the xslt extraction rule
# the first parameter is the app key; apply for one in the GooSeeker member center
# the second parameter is the rule name, generated with GooSeeker's graphical tool MS (谋数台)
bbsExtra.setXsltFromAPI("31d24931e043e2d5364d03b8ff9cc77e", "安居客房产经纪人")

url = "http://shenzhen.anjuke.com/tycoon/nanshan/p"
totalpages = 50
anjukeSpider = Spider()
print("爬取开始")

for pagenumber in range(1, totalpages):
    currenturl = url + str(pagenumber)
    print("正在爬取", currenturl)
    content = anjukeSpider.getContent(currenturl)
    outputxml = bbsExtra.extract(content)
Example no. 8
# _*_coding:utf8_*_
# crawler_gooseeker_bbs.py
# Version: V1.0

from urllib import request
from lxml import etree
from gooseeker import GsExtractor

# fetch and read the web page content
url = "http://www.gooseeker.com/cn/forum/7"
conn = request.urlopen(url)
doc = etree.HTML(conn.read())

bbsExtra = GsExtractor()
bbsExtra.setXsltFromAPI("31d24931e043e2d5364d03b8ff9cc77e" , "gooseeker_bbs_xslt")   # 设置xslt抓取规则,第一个参数是app key,请到会员中心申请
result = bbsExtra.extract(doc)   # 调用extract方法提取所需内容

print(str(result))

# _*_coding:utf8_*_
# crawler_gooseeker_bbs.py
# Version: V1.0

from urllib import request
from lxml import etree
from gooseeker import GsExtractor

# fetch and read the web page content
url = "http://www.gooseeker.com/cn/forum/7"
conn = request.urlopen(url)
doc = etree.HTML(conn.read())

bbsExtra = GsExtractor()   
bbsExtra.setXsltFromAPI("98adf83ksdf0slrwaerwersdkfjsa" , "gooseeker_bbs_xslt")   # 设置xslt抓取规则,第一个参数是app key,请到会员中心申请
result = bbsExtra.extract(doc)   # 调用extract方法提取所需内容

print(str(result))

totalpages = 50


class Spider:
    def getContent(self, url):
        conn = request.urlopen(url)
        output = etree.HTML(conn.read())
        return output

    def saveContent(self, filepath, content):
        file_obj = open(filepath, 'w', encoding='UTF-8')
        file_obj.write(content)
        file_obj.close()


bbsExtra = GsExtractor()
bbsExtra.setXsltFromAPI("31d24931e043e2d5364d03b8ff9cc77e",
                        "安居客房产经纪人")  # 设置xslt抓取规则,第一个参数是app key,请到会员中心申请

url = "http://shenzhen.anjuke.com/tycoon/nanshan/p"
anjukeSpider = Spider()
print("爬取开始")

for pagenumber in range(1, totalpages):
    currenturl = url + str(pagenumber)
    print("正在爬取", currenturl)
    content = anjukeSpider.getContent(currenturl)
    outputxml = bbsExtra.extract(content)
    outputfile = "result" + str(pagenumber) + ".xml"
    anjukeSpider.saveContent(outputfile, str(outputxml))
Example no. 11
    def __init__(self):
        # API provided by GooSeeker (Jisouke)
        self.extra = GsExtractor()

        # service provided by the IP proxy vendor
        self.Ipagency = IpAgency()

# -*- coding: utf-8 -*-
# Example program using the gsExtractor class
# Visit the GooSeeker forum and extract the forum content using an xslt as the template
# the xslt is stored in xslt_bbs.xml
from urllib import request
from lxml import etree
from gooseeker import GsExtractor

# fetch and read the web page content
url = "http://www.gooseeker.com/cn/forum/7"
conn = request.urlopen(url)
doc = etree.HTML(conn.read())

# create the xsltExtractor object
bbsExtra = GsExtractor()
# call the set method to load the xslt content
bbsExtra.setXsltFromFile("xslt_bbs.xml")
# call the extract method to pull out the desired content
result = bbsExtra.extract(doc)
# display the extraction result
print(str(result))
Example no. 13
from urllib import request
from lxml import etree
from gooseeker import GsExtractor

totalpages = 50

class Spider:
    def getContent(self, url):
        conn = request.urlopen(url)
        output = etree.HTML(conn.read())
        return output

    def saveContent(self, filepath, content):
        file_obj = open(filepath, 'w', encoding='UTF-8')
        file_obj.write(content)
        file_obj.close()

bbsExtra = GsExtractor()   
bbsExtra.setXsltFromAPI("31d24931e043e2d5364d03b8ff9cc77e" , "安居客房产经纪人")   # 设置xslt抓取规则,第一个参数是app key,请到会员中心申请

url = "http://shenzhen.anjuke.com/tycoon/nanshan/p"
anjukeSpider = Spider()
print("爬取开始")

for pagenumber in range(1 , totalpages):
    currenturl = url + str(pagenumber)
    print("正在爬取", currenturl)
    content = anjukeSpider.getContent(currenturl)
    outputxml = bbsExtra.extract(content)
    outputfile = "result" + str(pagenumber) + ".xml"
    anjukeSpider.saveContent(outputfile , str(outputxml))

print("爬取结束")
Example no. 14
# -*- coding: utf-8 -*-
# Example program using the GsExtractor class
# Visit the GooSeeker forum and extract the forum content using an xslt as the template
# the xslt is stored in xslt_bbs.xml
from urllib import request
from lxml import etree
from gooseeker import GsExtractor
import html

# fetch and read the web page content
url = "http://im.nju.edu.cn/teachers.do?type=1&mid=4"
conn = request.urlopen(url)
doc = etree.HTML(conn.read())

bbsExtra = GsExtractor()  # create the xsltExtractor object
bbsExtra.setXsltFromAPI("e346796c93c6ba7441636666e401e5cc", "im.nju.edu.cn")
xs = bbsExtra.getXslt()
result = bbsExtra.extract(doc)  # call the extract method to pull out the desired content
# out file
file_name = 'E:/parse_detail_' + '.xml'
open(file_name, "w").write(result)
print(result)
Example no. 15
import time
from lxml import etree
from selenium import webdriver
from gooseeker import GsExtractor


class PhantomSpider:
    def getContent(self, url):
        browser = webdriver.PhantomJS(executable_path='C:\\phantomjs-2.1.1-windows\\bin\\phantomjs.exe')
        browser.get(url)
        time.sleep(3)
        html = browser.execute_script("return document.documentElement.outerHTML")
        output = etree.HTML(html)
        return output

    def saveContent(self, filepath, content):
        file_obj = open(filepath, 'w', encoding='UTF-8')
        file_obj.write(content)
        file_obj.close()

doubanExtra = GsExtractor()   
# the following call uses gooseeker's API to set the xslt extraction rule
# the first parameter is the app key; apply for one in the GooSeeker member center
# the second parameter is the rule name, generated with GooSeeker's graphical tool MS (谋数台)
doubanExtra.setXsltFromAPI("ffd5273e213036d812ea298922e2627b", "豆瓣小组讨论话题")

url = "https://www.douban.com/group/haixiuzu/discussion?start="
totalpages = 5
doubanSpider = PhantomSpider()
print("爬取开始")

for pagenumber in range(1, totalpages):
    currenturl = url + str((pagenumber-1)*25)
    print("Crawling", currenturl)
    content = doubanSpider.getContent(currenturl)
    outputxml = doubanExtra.extract(content)
Example no. 16
# -*- coding: utf-8 -*-
# Example program using the GsExtractor class
# Use webdriver to drive Firefox and collect the Amazon product list
# the xslt is stored in xslt_bbs.xml
# the collected results are saved in the third folder
import os
import time
from lxml import etree
from selenium import webdriver
from gooseeker import GsExtractor

# instantiate the extractor
bbsExtra = GsExtractor()
bbsExtra.setXsltFromAPI("a0056d16ff3003ae9d5b48bcfa54f4af",
                        "newsCrawler")  # 设置xslt抓取规则

# create the directory for storing the results
current_path = os.getcwd()
res_path = current_path + "/third-result"
if not os.path.exists(res_path):
    os.mkdir(res_path)

# drive Firefox
driver = webdriver.Firefox()
url = "https://www.amazon.cn/s/ref=sr_pg_1?rh=n%3A658390051%2Cn%3A!658391051%2Cn%3A658414051%2Cn%3A658810051&page=1&ie=UTF8&qid=1476258544"
driver.get(url)
time.sleep(2)

# get the web page content
Example no. 17
# -*- coding: utf-8 -*-
# Example program using the gsExtractor class
# Visit the GooSeeker forum and extract the forum content using an xslt as the template
# the xslt is stored in xslt_bbs.xml
# the collected result is saved in result.xml

import os
from urllib import request
import lxml
from lxml import etree
import gooseeker
from gooseeker import GsExtractor

# fetch and read the web page content
url = "http://www.gooseeker.com/cn/forum/7"
conn = request.urlopen(url)
doc = etree.HTML(conn.read())

bbsExtra = GsExtractor()
bbsExtra.setXsltFromAPI("31d24931e043e2d5364d03b8ff9cc77e",
                        "gooseeker_bbs_xslt")  # 设置xslt抓取规则
result = bbsExtra.extract(doc)  # 调用extract方法提取所需内容

# current directory
current_path = os.getcwd()
file_path = current_path + "/result.xml"

# save the result
open(file_path, "wb").write(result)
Example no. 18
# _*_coding:utf8_*_
# crawler_gooseeker_bbs.py
# Version: V1.0

from urllib import request
from lxml import etree
from gooseeker import GsExtractor

# fetch and read the web page content
url = "http://www.gooseeker.com/cn/forum/7"
conn = request.urlopen(url)
doc = etree.HTML(conn.read())

bbsExtra = GsExtractor()
bbsExtra.setXsltFromAPI(
    "98adf83ksdf0slrwaerwersdkfjsa",
    "gooseeker_bbs_xslt")  # 设置xslt抓取规则,第一个参数是app key,请到会员中心申请
result = bbsExtra.extract(doc)  # call the extract method to pull out the desired content

print(str(result))

from selenium import webdriver
import time


class PhantomSpider:
    def getContent(self, url):
        browser = webdriver.PhantomJS(
            executable_path='C:\\phantomjs-2.1.1-windows\\bin\\phantomjs.exe')
        browser.get(url)
        time.sleep(3)
        html = browser.execute_script(
            "return document.documentElement.outerHTML")
        output = etree.HTML(html)
        return output

    def saveContent(self, filepath, content):
        file_obj = open(filepath, 'w', encoding='UTF-8')
        file_obj.write(content)
        file_obj.close()


doubanExtra = GsExtractor()
# the following call uses gooseeker's API to set the xslt extraction rule
# the first parameter is the app key; apply for one in the GooSeeker member center
# the second parameter is the rule name, generated with GooSeeker's graphical tool MS (谋数台)
doubanExtra.setXsltFromAPI("ffd5273e213036d812ea298922e2627b", "豆瓣小组讨论话题")

url = "https://www.douban.com/group/haixiuzu/discussion?start="
totalpages = 5
doubanSpider = PhantomSpider()
print("爬取开始")

for pagenumber in range(1, totalpages):
    currenturl = url + str((pagenumber - 1) * 25)
    print("正在爬取", currenturl)
    content = doubanSpider.getContent(currenturl)
    outputxml = doubanExtra.extract(content)