def parse(self, response):
    """Scrapy callback: extract Anjuke listing data from the response via a
    GooSeeker xslt rule and save it as an XML file in the cwd."""
    print("----------------------------------------------------------------------------")
    # Build the GooSeeker extractor and load the xslt scraping rule
    # (first arg is the app key, second is the rule/theme name).
    bbsExtra = GsExtractor()
    bbsExtra.setXsltFromAPI("31d24931e043e2d5364d03b8ff9cc77e", "安居客_房源")
    # Apply the rule to the raw page body.
    result = bbsExtra.extractHTML(response.body)
    # Print the result; the GBK encode/ignore round-trip drops characters
    # the console codepage cannot represent (Chinese Windows consoles).
    print(str(result).encode('gbk', 'ignore').decode('gbk'))
    # Save the result; `with` guarantees the handle is closed
    # (the original leaked it via open(...).write(...)).
    file_path = os.getcwd() + "/anjuke-result.xml"
    with open(file_path, "wb") as f:
        f.write(result)
    # Report where the result was written.
    print("采集结果文件:" + file_path)
def parse(self, response):
    """Scrapy callback: extract Anjuke listing data via the latest GooSeeker
    rule extractor and save the result as XML in the cwd."""
    print("-" * 10)
    # Build the extractor and load the xslt scraping rule from the API.
    bbsExtra = GsExtractor()
    bbsExtra.setXsltFromAPI("31d24931e043e2d5364d03b8ff9cc77e", "安居客_房源")
    # Apply the rule to the raw page body.
    result = bbsExtra.extractHTML(response.body)
    # Console-safe print: drop characters GBK cannot represent.
    res = str(result).encode('gbk', 'ignore').decode('gbk')
    print(res)
    # Save the result; `with` closes the handle (original leaked it).
    file_path = os.getcwd() + "/anjuke-result.xml"
    with open(file_path, "wb") as f:
        f.write(result)
    print("采集结果文件:" + file_path)
def parse(self, response):
    """Scrapy callback: render the page in a real browser, extract product
    details via a GooSeeker xslt rule, and save them as an XML file."""
    print("start...")
    # Load the page in the shared browser; fixed 3 s wait for the
    # JavaScript-rendered content to settle.
    self.browser.get(response.url)
    time.sleep(3)
    # Fetch the xslt rule. NOTE(review): "API KEY" is a placeholder —
    # substitute a real GooSeeker app key before running.
    extra = GsExtractor()
    extra.setXsltFromAPI("API KEY", "淘宝天猫_商品详情30474")
    # Extract from the fully rendered DOM (stray trailing `;` removed).
    html = self.browser.execute_script("return document.documentElement.outerHTML")
    doc = etree.HTML(html)
    result = extra.extract(doc)
    # Save the result; `with` closes the handle (original leaked it).
    file_name = 'E:/淘宝天猫_商品详情30474_' + self.getTime() + '.xml'
    with open(file_name, "wb") as f:
        f.write(result)
    self.browser.close()
    print("end")
def parse(self, response):
    """Scrapy callback: render the page in a real browser, extract product
    details via a GooSeeker xslt rule, and save them as an XML file."""
    print("start...")
    # Load the page in the shared browser; fixed 3 s wait for the
    # JavaScript-rendered content to settle.
    self.browser.get(response.url)
    time.sleep(3)
    # Fetch the xslt scraping rule (app key + rule name).
    extra = GsExtractor()
    extra.setXsltFromAPI("0a3898683f265e7b28991e0615228baa", "淘宝天猫_商品详情30474")
    # Extract from the fully rendered DOM.
    html = self.browser.execute_script("return document.documentElement.outerHTML")
    doc = etree.HTML(html)
    result = extra.extract(doc)
    # Save the result; `with` closes the handle (original leaked it).
    file_name = 'F:/temp/淘宝天猫_商品详情30474_' + self.getTime() + '.xml'
    with open(file_name, "wb") as f:
        f.write(result)
    self.browser.close()
    print("end")
class webCon:
    """Fetch a page and run a GooSeeker xslt rule over it, falling back
    through three strategies: shared Chrome browser -> requests via the
    paid proxy -> requests from the local IP."""

    # Chrome is slow to start, so share one instance across all objects;
    # class attributes are instantiated only once.
    chrome = chromes()
    mylog = Mylog("WebCon")

    def __init__(self):
        # GooSeeker extractor (API-provided xslt rules).
        self.extra = GsExtractor()
        # Proxy-IP vendor service.
        self.Ipagency = IpAgency()

    def _extract(self, data, rule):
        # Parse the raw html and apply the named xslt rule; returns the
        # extracted content as xml bytes (shared by all three strategies —
        # the original repeated these three lines in each branch).
        doc = etree.HTML(data)
        self.extra.setXsltFromAPI(theme=rule)  # fetch and load the rule file
        return self.extra.extract(doc)

    def con_ThenGetContent(self, url, rule):
        """Return extracted content for *url*, or None if every strategy fails."""
        try:
            # Strategy 1: drive the shared Chrome browser.
            data = webCon.chrome.get_html(url=url)
            return self._extract(data, rule)
        except Exception as e:
            webCon.mylog.debug("chrome hava some problem:" + str(e))
        # Chrome failed — fall back to plain HTTP with a random User-Agent.
        rd = RandomUserAgent()
        headers = {"User-Agent": rd.get_RanDomAgent()}
        try:
            # Strategy 2: requests through the proxy vendor.
            data = requests.get(url=url, headers=headers, timeout=500,
                                proxies=self.Ipagency.getIpProxy()).text
            return self._extract(data, rule)
        except Exception as e1:
            # Fix: the original printed "chrome hava some problem" here,
            # but this branch is the proxied-requests failure.
            print("proxy requests have some problem:" + str(e1))
        try:
            # Strategy 3: proxy failed — requests from the local IP.
            data = requests.get(url=url, headers=headers, timeout=500).text
            return self._extract(data, rule)
        except Exception as e2:
            # All strategies failed: log and report, then give up.
            webCon.mylog.error(str.encode((str(url) + ":" + str(e2))))
            print(str.encode((str(url) + ":" + str(e2))))
            return None

    def getextra(self):
        """Return the underlying GsExtractor instance."""
        return self.extra
# _*_coding:utf8_*_
# crawler_gooseeker_bbs.py
# Version: V1.0
from urllib import request
from lxml import etree
from gooseeker import GsExtractor

# Fetch and parse the forum page; `with` closes the connection
# (the original never closed it).
url = "http://www.gooseeker.com/cn/forum/7"
with request.urlopen(url) as conn:
    doc = etree.HTML(conn.read())

bbsExtra = GsExtractor()
# Load the xslt scraping rule: the first argument is the app key
# (apply for one in the GooSeeker member center), the second the rule name.
bbsExtra.setXsltFromAPI("98adf83ksdf0slrwaerwersdkfjsa", "gooseeker_bbs_xslt")
result = bbsExtra.extract(doc)  # run the rule over the parsed page
print(str(result))
class Spider:
    """Minimal page fetcher/saver used by the Anjuke crawl script below.

    NOTE(review): the original fragment instantiated Spider() without a
    visible class definition; the wrapper is restored here.
    """

    def getContent(self, url):
        """Fetch *url* and return it parsed as an lxml document."""
        conn = request.urlopen(url)
        try:
            return etree.HTML(conn.read())
        finally:
            # Close the connection (the original leaked it).
            conn.close()

    def saveContent(self, filepath, content):
        """Write *content* to *filepath* as UTF-8 text."""
        # `with` closes the handle even if the write raises.
        with open(filepath, 'w', encoding='UTF-8') as file_obj:
            file_obj.write(content)


bbsExtra = GsExtractor()
# 下面这句调用gooseeker的api来设置xslt抓取规则 — the first argument is the
# app key (apply in the GooSeeker member center), the second the rule name
# created with the graphical tool (谋数台MS).
bbsExtra.setXsltFromAPI("31d24931e043e2d5364d03b8ff9cc77e", "安居客房产经纪人")

url = "http://shenzhen.anjuke.com/tycoon/nanshan/p"
totalpages = 50
anjukeSpider = Spider()
print("爬取开始")
# NOTE(review): range(1, totalpages) visits pages 1..49, not 50 —
# use range(1, totalpages + 1) if all 50 pages are intended.
for pagenumber in range(1, totalpages):
    currenturl = url + str(pagenumber)
    print("正在爬取", currenturl)
    content = anjukeSpider.getContent(currenturl)
    outputxml = bbsExtra.extract(content)
    outputfile = "result" + str(pagenumber) + ".xml"
    anjukeSpider.saveContent(outputfile, str(outputxml))
print("爬取结束")
# _*_coding:utf8_*_
# crawler_gooseeker_bbs.py
# Version: V1.0
from urllib import request
from lxml import etree
from gooseeker import GsExtractor

# Fetch and parse the forum page; `with` closes the connection
# (the original never closed it).
url = "http://www.gooseeker.com/cn/forum/7"
with request.urlopen(url) as conn:
    doc = etree.HTML(conn.read())

bbsExtra = GsExtractor()
# Load the xslt scraping rule: the first argument is the app key
# (apply for one in the GooSeeker member center), the second the rule name.
bbsExtra.setXsltFromAPI("31d24931e043e2d5364d03b8ff9cc77e", "gooseeker_bbs_xslt")
result = bbsExtra.extract(doc)  # run the rule over the parsed page
print(str(result))
# _*_coding:utf8_*_
# crawler_gooseeker_bbs.py
# Version: V1.0
from urllib import request
from lxml import etree
from gooseeker import GsExtractor

# Fetch and parse the forum page; `with` closes the connection
# (the original never closed it).
url = "http://www.gooseeker.com/cn/forum/7"
with request.urlopen(url) as conn:
    doc = etree.HTML(conn.read())

bbsExtra = GsExtractor()
# Load the xslt scraping rule: the first argument is the app key
# (apply for one in the GooSeeker member center), the second the rule name.
bbsExtra.setXsltFromAPI("98adf83ksdf0slrwaerwersdkfjsa", "gooseeker_bbs_xslt")
result = bbsExtra.extract(doc)  # run the rule over the parsed page
print(str(result))
class Spider:
    """Minimal page fetcher/saver used by the Anjuke crawl script below."""

    def getContent(self, url):
        """Fetch *url* and return it parsed as an lxml document."""
        conn = request.urlopen(url)
        try:
            return etree.HTML(conn.read())
        finally:
            # Close the connection (the original leaked it).
            conn.close()

    def saveContent(self, filepath, content):
        """Write *content* to *filepath* as UTF-8 text."""
        # `with` closes the handle even if the write raises.
        with open(filepath, 'w', encoding='UTF-8') as file_obj:
            file_obj.write(content)


bbsExtra = GsExtractor()
# Load the xslt scraping rule: the first argument is the app key
# (apply for one in the GooSeeker member center), the second the rule name.
bbsExtra.setXsltFromAPI("31d24931e043e2d5364d03b8ff9cc77e", "安居客房产经纪人")

url = "http://shenzhen.anjuke.com/tycoon/nanshan/p"
# Fix: `totalpages` was used in the loop below but never defined
# (NameError at runtime); 50 matches the sibling versions of this script.
totalpages = 50
anjukeSpider = Spider()
print("爬取开始")
for pagenumber in range(1, totalpages):
    currenturl = url + str(pagenumber)
    print("正在爬取", currenturl)
    content = anjukeSpider.getContent(currenturl)
    outputxml = bbsExtra.extract(content)
    outputfile = "result" + str(pagenumber) + ".xml"
    anjukeSpider.saveContent(outputfile, str(outputxml))
print("爬取结束")
# -*- coding: utf-8 -*-
# Sample program using the gsExtractor class.
# Visits the GooSeeker forum and extracts its content with an xslt template.
# The xslt is kept in xslt_bbs.xml; results are saved to result.xml.
import os
from urllib import request
import lxml
from lxml import etree
import gooseeker
from gooseeker import GsExtractor

# Fetch and parse the forum page; `with` closes the connection
# (the original never closed it).
url = "http://www.gooseeker.com/cn/forum/7"
with request.urlopen(url) as conn:
    doc = etree.HTML(conn.read())

bbsExtra = GsExtractor()
bbsExtra.setXsltFromAPI("31d24931e043e2d5364d03b8ff9cc77e", "gooseeker_bbs_xslt")  # load the xslt scraping rule
result = bbsExtra.extract(doc)  # run the rule over the parsed page

# Save into the current working directory; `with` closes the handle
# (the original leaked it via open(...).write(...)).
current_path = os.getcwd()
file_path = current_path + "/result.xml"
with open(file_path, "wb") as f:
    f.write(result)
# -*- coding: utf-8 -*-
# Sample program using the GsExtractor class.
# Drives Firefox via webdriver to collect an Amazon product listing.
# Results are stored under the third-result/ folder.
import os
import time
from lxml import etree
from selenium import webdriver
from gooseeker import GsExtractor

# Build the extractor and load the xslt scraping rule from the API.
bbsExtra = GsExtractor()
bbsExtra.setXsltFromAPI("a0056d16ff3003ae9d5b48bcfa54f4af", "newsCrawler")

# Create the result directory; makedirs(exist_ok=True) replaces the
# original `if exists: pass else: mkdir` chain and avoids the TOCTOU race.
current_path = os.getcwd()
res_path = current_path + "/third-result"
os.makedirs(res_path, exist_ok=True)

# Drive Firefox to the listing page and give the content time to render.
driver = webdriver.Firefox()
url = "https://www.amazon.cn/s/ref=sr_pg_1?rh=n%3A658390051%2Cn%3A!658391051%2Cn%3A658414051%2Cn%3A658810051&page=1&ie=UTF8&qid=1476258544"
driver.get(url)
time.sleep(2)
# Grab the page content.
# NOTE(review): the fragment ends here; the extraction/saving code that
# presumably follows is outside this view.
# Number of listing pages to crawl (see the range() note below).
totalpages = 50


class Spider:
    """Minimal page fetcher/saver used by the Anjuke crawl script below."""

    def getContent(self, url):
        """Fetch *url* and return it parsed as an lxml document."""
        conn = request.urlopen(url)
        try:
            return etree.HTML(conn.read())
        finally:
            # Close the connection (the original leaked it).
            conn.close()

    def saveContent(self, filepath, content):
        """Write *content* to *filepath* as UTF-8 text."""
        # `with` closes the handle even if the write raises.
        with open(filepath, 'w', encoding='UTF-8') as file_obj:
            file_obj.write(content)


bbsExtra = GsExtractor()
# Load the xslt scraping rule: the first argument is the app key
# (apply for one in the GooSeeker member center), the second the rule name.
bbsExtra.setXsltFromAPI("31d24931e043e2d5364d03b8ff9cc77e", "安居客房产经纪人")

url = "http://shenzhen.anjuke.com/tycoon/nanshan/p"
anjukeSpider = Spider()
print("爬取开始")
# NOTE(review): range(1, totalpages) visits pages 1..49, not 50 —
# use range(1, totalpages + 1) if all 50 pages are intended.
for pagenumber in range(1, totalpages):
    currenturl = url + str(pagenumber)
    print("正在爬取", currenturl)
    content = anjukeSpider.getContent(currenturl)
    outputxml = bbsExtra.extract(content)
    outputfile = "result" + str(pagenumber) + ".xml"
    anjukeSpider.saveContent(outputfile, str(outputxml))
print("爬取结束")
# -*- coding: utf-8 -*-
# Sample program using the GsExtractor class.
# Fetches a faculty listing page and extracts its content via an xslt rule.
from urllib import request
from lxml import etree
from gooseeker import GsExtractor
import html

# Fetch and parse the page; `with` closes the connection
# (the original never closed it).
url = "http://im.nju.edu.cn/teachers.do?type=1&mid=4"
conn = request.urlopen(url)
try:
    doc = etree.HTML(conn.read())
finally:
    conn.close()

bbsExtra = GsExtractor()  # build the xslt extractor object
bbsExtra.setXsltFromAPI("e346796c93c6ba7441636666e401e5cc", "im.nju.edu.cn")
result = bbsExtra.extract(doc)  # run the rule over the parsed page
# (Unused local `xs = bbsExtra.getXslt()` removed.)

# Save the result. Fix: the original opened the file in text mode ("w");
# the sibling scripts write extract()'s result with "wb", indicating it is
# bytes, which would raise TypeError in text mode. `with` closes the handle.
file_name = 'E:/parse_detail_' + '.xml'
with open(file_name, "wb") as f:
    f.write(result)
print(result)
# NOTE(review): this fragment begins mid-way through PhantomSpider.getContent();
# the method header and the browser construction are outside this view.

        # Load the page and give the JS-rendered content 3 s to settle.
        browser.get(url)
        time.sleep(3)
        # Read the fully rendered DOM and parse it with lxml.
        html = browser.execute_script("return document.documentElement.outerHTML")
        output = etree.HTML(html)
        return output

    def saveContent(self, filepath, content):
        """Write *content* to *filepath* as UTF-8 text."""
        file_obj = open(filepath, 'w', encoding='UTF-8')
        file_obj.write(content)
        file_obj.close()


doubanExtra = GsExtractor()
# The call below asks the GooSeeker API for the xslt scraping rule:
# the first argument is the app key (apply in the GooSeeker member center),
# the second is the rule name built with the graphical tool (谋数台MS).
doubanExtra.setXsltFromAPI("ffd5273e213036d812ea298922e2627b", "豆瓣小组讨论话题")

url = "https://www.douban.com/group/haixiuzu/discussion?start="
totalpages = 5
doubanSpider = PhantomSpider()
print("爬取开始")
for pagenumber in range(1, totalpages):
    # Each page shows 25 topics; the start offset is (page-1)*25.
    currenturl = url + str((pagenumber - 1) * 25)
    print("正在爬取", currenturl)
    content = doubanSpider.getContent(currenturl)
    outputxml = doubanExtra.extract(content)
    outputfile = "result" + str(pagenumber) + ".xml"
    doubanSpider.saveContent(outputfile, str(outputxml))
print("爬取结束")
# NOTE(review): this fragment begins mid-way through PhantomSpider.getContent();
# the method header and the page-loading lines are outside this view.

        # Read the fully rendered DOM and parse it with lxml.
        html = browser.execute_script(
            "return document.documentElement.outerHTML")
        output = etree.HTML(html)
        return output

    def saveContent(self, filepath, content):
        """Write *content* to *filepath* as UTF-8 text."""
        file_obj = open(filepath, 'w', encoding='UTF-8')
        file_obj.write(content)
        file_obj.close()


doubanExtra = GsExtractor()
# The call below asks the GooSeeker API for the xslt scraping rule:
# the first argument is the app key (apply in the GooSeeker member center),
# the second is the rule name built with the graphical tool (谋数台MS).
doubanExtra.setXsltFromAPI("ffd5273e213036d812ea298922e2627b", "豆瓣小组讨论话题")

url = "https://www.douban.com/group/haixiuzu/discussion?start="
totalpages = 5
doubanSpider = PhantomSpider()
print("爬取开始")
for pagenumber in range(1, totalpages):
    # Each page shows 25 topics; the start offset is (page-1)*25.
    currenturl = url + str((pagenumber - 1) * 25)
    print("正在爬取", currenturl)
    content = doubanSpider.getContent(currenturl)
    outputxml = doubanExtra.extract(content)
    outputfile = "result" + str(pagenumber) + ".xml"
    doubanSpider.saveContent(outputfile, str(outputxml))
print("爬取结束")