# -*- coding: utf-8 -*-
# Shared imports assumed by the spider methods below (Python 2 / Scrapy).
# The module behind the `MI` alias is not shown in this file; the name used
# here is a placeholder.
import re
import time
import urllib2
import urlparse

import scrapy

import FirmcrawlerItems as MI  # placeholder module name for the item definitions


def parse_page(self, response):
    prototype = response.meta['prototype']
    item = MI.FirmcrawlerItem(prototype)
    tables = response.xpath(
        '//div[@class="list"]/table/tr[4]//a/@href').extract()
    absurl = urlparse.urljoin(response.url, tables[0])
    filename = tables[0].split('/')[-1].replace(".online", "")
    softname = response.xpath(
        '//div[@class="list"]/table/tr[1]/td/text()').extract().pop().strip()
    # The version looks like "V1.2.3" or "1.2.3.4"; the model precedes it.
    match = re.search(r'[Vv]?\d\.\d\.\d\.*\d*', softname)
    if match:
        version = match.group()
        model = softname.split(" ")[-1].split(version)[0]
    else:
        version = ""
        model = softname
    model = model.encode('utf-8').replace("-", "").replace("_", "")
    item["productVersion"] = version
    item["publishTime"] = ""
    item["productClass"] = ""
    item["productModel"] = model
    item["description"] = ""
    item["url"] = absurl
    item["firmwareName"] = filename
    item["crawlerTime"] = time.strftime("%Y-%m-%d %H:%M:%S")
    yield item
    print "firmwarename:", item["firmwareName"]
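# Every method in this file fills the same Scrapy item. Its real definition
# lives in the module aliased as `MI` and is not shown here; the sketch below
# is an assumption reconstructed from the field names the spiders use.
class FirmcrawlerItem(scrapy.Item):  # hypothetical reconstruction
    firmwareName = scrapy.Field()    # file name of the firmware image
    manufacturer = scrapy.Field()    # vendor tag, e.g. "tiandy", "netcore"
    productClass = scrapy.Field()    # device class, e.g. "Router"
    productModel = scrapy.Field()    # device model parsed from the page
    productVersion = scrapy.Field()  # firmware version string, if any
    publishTime = scrapy.Field()     # release date, normalised to YYYY-MM-DD
    crawlerTime = scrapy.Field()     # time the item was scraped
    description = scrapy.Field()     # free-text description from the page
    url = scrapy.Field()             # absolute download URL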
def parse_list(self, response):
    prototype = response.meta['prototype']
    lines = response.xpath(
        '//div[@id="c_companyFile_list-15086754191809347"]/div/div[1]/div')
    for line in lines:
        filename = line.xpath(
            './div/div[2]/div[1]/a/h3/div/text()').extract().pop()
        # The model is the part of the name before "升级" ("upgrade").
        productModel = filename.split(u"升级")[0]
        publishTime = line.xpath(
            './div/div[2]/div[4]/div/div/text()').extract().pop()
        # Example download URLs:
        #   http://www.netcoretec.com/comp/companyFile/download.do?fid=104&appId=24&id=98
        #   http://www.netcoretec.com/comp/companyFile/download.do?fid=103&appId=24&id=97
        # The fid/id parameters are hard to find on the page itself (they sit
        # in the JavaScript, but a careful search turns them up).
        cid = line.xpath('./div/a/@cid').extract().pop()
        data = line.xpath('./div/a/@data').extract().pop()
        absurl = ("http://www.netcoretec.com/comp/companyFile/download.do"
                  "?fid=" + str(cid) + "&appId=24&id=" + str(data))
        item = MI.FirmcrawlerItem(prototype)
        item["firmwareName"] = filename
        item["url"] = absurl
        item["productVersion"] = ""
        item["publishTime"] = publishTime
        item["productClass"] = ""
        item["productModel"] = productModel
        item["description"] = ""
        item["crawlerTime"] = time.strftime("%Y-%m-%d %H:%M:%S")
        yield item
        print "firmwarename:", item["firmwareName"]
def parse(self, response):
    table_list = response.xpath("//html/body/div[3]/div/div[2]/table")
    for table in table_list:
        filename = table.xpath(
            "./tbody/tr[1]/td[1]/strong/text()").extract().pop()
        description = table.xpath("./tbody/tr[2]/td/p/text()").extract()
        desc = "".join(d.strip() for d in description)
        absurl = table.xpath(
            "./tbody/tr[1]/td[2]/strong/a/@href").extract().pop()
        # e.g. http://www.tiandy.com/wp-content/files/Easy7SmartClientProfessionalV7.14T.zip
        item = MI.FirmcrawlerItem()
        item["firmwareName"] = filename
        item["publishTime"] = ""
        item["crawlerTime"] = time.strftime("%Y-%m-%d %H:%M:%S")
        item["url"] = absurl
        item["description"] = desc
        item["productClass"] = ""
        item["productVersion"] = ""
        item["productModel"] = ""
        item["manufacturer"] = "tiandy"
        yield item
        print "firmwarename:", item["firmwareName"]
def parse_page(self, response):
    prototype = response.meta['prototype']
    # NOTE: stashing per-response state on the spider instance is fragile when
    # Scrapy runs requests concurrently; parse_next reads these attributes.
    version = response.xpath(
        '//div[@class="down1-ccont"]/div[2]/p[1]/text()').extract()
    if version:
        self.productVersion = version[0]
    else:
        self.productVersion = ""
    description = response.xpath(
        '//div[@class="down1-ccont"]/div[2]/p[position()>1]/text()').extract()
    if description:
        self.desc = " ".join(description)
    else:
        self.desc = ""
    urls = response.xpath(
        '//div[@class="down1-ccont"]/div[2]/p[1]/a/@href').extract()
    if urls:
        url = urls[0]
        request = scrapy.Request(url, callback=self.parse_next)
        request.meta["prototype"] = MI.FirmcrawlerItem()
        request.meta["prototype"]["manufacturer"] = "xiongmai"
        yield request
def parse_page(self, response):
    prototype = response.meta['prototype']
    tables = response.xpath('//div[@class="sofewear"]/table[1]/tbody/tr')
    for t in tables:
        softname = t.xpath('./td[2]/a/text()').extract().pop().strip()
        # Skip driver ("驱动") downloads; only firmware rows become items.
        if "驱动" not in softname.encode('utf-8'):
            url = t.xpath('./td[2]/a/@href').extract()
            absurl = urlparse.urljoin(response.url, url[0]).replace(
                " ", "%20").replace("(", "%28").replace(")", "%29")
            model = t.xpath('./td[1]/text()').extract().pop().strip()
            publishtime = t.xpath('./td[4]/text()').extract().pop()
            match = re.search(r"[Vv]?\d\.\d", softname)
            version = match.group() if match else ""
            # Build a fresh item per row; reusing one item across yields would
            # let later rows overwrite earlier ones.
            item = MI.FirmcrawlerItem(prototype)
            item["productVersion"] = version
            item["publishTime"] = publishtime
            item["productClass"] = ""
            item["productModel"] = model
            item["description"] = softname
            item["url"] = absurl
            item["firmwareName"] = item["url"].split('/')[-1]
            item["crawlerTime"] = time.strftime("%Y-%m-%d %H:%M:%S")
            yield item
            print "firmwarename:", item["firmwareName"]
        else:
            print "qudong :", softname
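# A hedged alternative to the manual replace() chain above: urllib.quote
# percent-encodes every unsafe path character in one call. This is a sketch,
# not what the spider does; behaviour differs on already-encoded URLs.
import urllib

def quote_url_path(url):
    """Percent-encode the path of a download URL (spaces, parentheses, ...)."""
    parts = urlparse.urlsplit(url)
    return urlparse.urlunsplit((parts.scheme, parts.netloc,
                                urllib.quote(parts.path),
                                parts.query, parts.fragment))

# quote_url_path("http://host/a b(1).bin") -> "http://host/a%20b%281%29.bin"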
def parse_page(self, response):
    urls = response.xpath(
        '//div[@class="download_rig fr"]/a/@href').extract()[0]
    absurl = urlparse.urljoin(response.url, urls)
    info = response.xpath(
        '//div[@class="download_rig fr"]//text()').extract()
    info = "".join(info).strip().encode('utf-8')
    # The info block reads "路由器型号:... 固件版本:... 固件大小:... 上传日期:...
    # 软件简介:..." (router model / firmware version / size / date / description).
    modelt = info.split("路由器型号:")[-1]
    if "固件版本" in modelt:
        model = modelt.split("固件版本:")[0].strip()
        version = modelt.split("固件版本:")[-1].split("固件大小:")[0].strip()
    else:
        model = modelt.split("固件大小:")[0].strip()
        version = ""
    publishtime = modelt.split("上传日期:")[-1].split("软件简介:")[0].strip()
    desc = modelt.split("软件简介:")[-1].split("。")[0].strip()
    item = MI.FirmcrawlerItem()
    item["url"] = absurl
    try:
        res = urllib2.urlopen(urllib2.Request(item["url"], None),
                              timeout=lblinkSpider.timeout)
        # This site embeds the quoted file name in the Content-Type header.
        contentType = res.headers["content-type"]
        item["firmwareName"] = contentType.split('"')[1]
    except Exception, e:
        print "no firmware name"
        print e
        # Fallback (assumed): use the last URL segment as the name.
        item["firmwareName"] = absurl.split('/')[-1]
    item["productVersion"] = version
    item["publishTime"] = publishtime
    item["productClass"] = ""
    item["productModel"] = model
    item["description"] = desc
    item["crawlerTime"] = time.strftime("%Y-%m-%d %H:%M:%S")
    item["manufacturer"] = "lblink"  # inferred from the spider class name
    yield item
    print "firmwarename:", item["firmwareName"]
def parse(self, response):
    for i in xrange(1, 3 + 1):
        url = "http://www.jcgcn.com/list-42-22-%s/" % i
        request = scrapy.Request(url, callback=self.parse_list)
        request.meta["prototype"] = MI.FirmcrawlerItem()
        request.meta["prototype"]["manufacturer"] = "jcg"
        yield request
def parse(self, response):
    ul_list = response.xpath("/html/body/div[2]/div/div/ul")
    for ul in ul_list:
        for li in ul.xpath("./li"):
            version = li.xpath("./a/text()").extract().pop()
            absurl = li.xpath("./a/@href").extract().pop()
            filename = absurl.split("/")[-1]
            item = MI.FirmcrawlerItem()
            item["firmwareName"] = filename
            item["productVersion"] = version
            item["productModel"] = ""
            item["productClass"] = ""
            item["publishTime"] = ""
            item["crawlerTime"] = time.strftime("%Y-%m-%d %H:%M:%S")
            item["url"] = absurl
            item["description"] = ""
            item["manufacturer"] = "micropython"
            yield item
            print "firmwarename:", item["firmwareName"]
def parse_page(self, response):
    prototype = response.meta['prototype']
    item = MI.FirmcrawlerItem(prototype)
    tables = response.xpath(
        '//div[@class="table-wrap"]/table/tbody/tr[4]/td[2]/a/@href'
    ).extract().pop()
    absurl = urlparse.urljoin(response.url, tables.replace(' ', '%20'))
    filename = tables.split('/')[-1]
    softname = response.xpath(
        '//div[@class="table-wrap"]/table/tbody/tr[1]/td[2]//text()'
    ).extract().pop().strip()
    desc = response.xpath(
        '//div[@class="table-wrap"]/table/tbody/tr[5]/td[2]//text()'
    ).extract()
    publishtime = response.xpath(
        '//div[@class="table-wrap"]/table/tbody/tr[3]/td[2]//text()'
    ).extract().pop()
    # softname apparently looks like "<model> ... <version>_<build>".
    model = softname.split(' ')[0]
    version = softname.split(' ')[-1].split('_')[0]
    item["productVersion"] = version
    item["publishTime"] = publishtime.strip()
    item["productClass"] = ""
    item["productModel"] = model
    item["description"] = "".join(desc).strip()
    item["url"] = absurl
    item["firmwareName"] = filename
    item["crawlerTime"] = time.strftime("%Y-%m-%d %H:%M:%S")
    yield item
    print "firmwarename:", item["firmwareName"]
def parse_page(self, response):
    li_list = response.xpath('//html/body/div[3]/div/div[2]/ul/li')
    for li in li_list:
        did_value = li.xpath('./@did').extract().pop()
        absurl = ("http://www.cn.onkyo.com/2018newsite/Download/"
                  + str(did_value) + ".html")
        file_name = li.xpath('./a/text()').extract().pop()
        # Entries read "<name>固件更新<date>"; "固件更新" means "firmware
        # update", so the name precedes it and the date follows.
        filename = file_name.split(u"固件更新")[0]
        publishTime = file_name.split(u"固件更新")[-1]
        item = MI.FirmcrawlerItem()
        item["firmwareName"] = filename
        item["productVersion"] = ""
        item["productModel"] = ""
        item["productClass"] = ""
        item["publishTime"] = publishTime
        item["crawlerTime"] = time.strftime("%Y-%m-%d %H:%M:%S")
        item["url"] = absurl
        item["description"] = ""
        item["manufacturer"] = "onkyo"
        yield item
        print "firmwarename:", item["firmwareName"]
def parse(self, response):
    for i in xrange(1, 5 + 1):
        url = "http://www.tomaxcom.com/shengjiruanjian/list_30_%s.html" % i
        request = scrapy.Request(url, callback=self.parse_list)
        request.meta["prototype"] = MI.FirmcrawlerItem()
        request.meta["prototype"]["manufacturer"] = "tomax"
        yield request
def parse_list(self, response):
    tables = response.xpath('//div[@class="list"]/dl//@href').extract()
    for t in tables:
        url = urlparse.urljoin(response.url, t)
        request = scrapy.Request(url, callback=self.parse_page)
        request.meta["prototype"] = MI.FirmcrawlerItem()
        request.meta["prototype"]["manufacturer"] = "tg-net"
        yield request
def parse(self, response):
    for i in xrange(1, 12 + 1):
        url = "http://www.tg-net.cn/download_106_%s.html" % i
        request = scrapy.Request(url, callback=self.parse_list)
        request.meta["prototype"] = MI.FirmcrawlerItem()
        request.meta["prototype"]["manufacturer"] = "TG-NET"
        yield request
def parse(self, response):
    for i in xrange(1, 25):
        url = "http://service.mercurycom.com.cn/download-list.html?p=%s" % i
        request = scrapy.Request(url, callback=self.parse_list)
        request.meta["prototype"] = MI.FirmcrawlerItem()
        request.meta["prototype"]["manufacturer"] = "mercury"
        yield request
def parse(self, response):
    for i in xrange(1, 18 + 1):
        url = ("http://service.fastcom.com.cn/download-list.html"
               "?classTip=software&p=%s&o=1&ajax=True&paging=False" % i)
        request = scrapy.Request(url, callback=self.parse_list)
        request.meta["prototype"] = MI.FirmcrawlerItem()
        request.meta["prototype"]["manufacturer"] = "fast"
        yield request
def parse_page(self, response):
    title = response.xpath(
        ".//section[@id='page-title']/div/h1/text()").extract().pop()
    # The model is the part of the page title before "固件" ("firmware").
    productModel = title.split(u"固件")[0]
    div1 = response.xpath(
        ".//div[@id='posts']/div[@class='entry clearfix']")
    div2 = response.xpath(
        ".//div[@id='posts']/div[@class='entry clearfix alt']")
    div = div1 + div2
    for d in div:
        absurl = d.xpath("./div[2]/div/div/a[1]/@href").extract().pop()
        filename = d.xpath("./div[2]/div/div/a[1]/text()").extract().pop()
        desc_info = d.xpath("./div[2]/div/div")
        desc = "".join(s.strip() for s in desc_info.xpath('string(.)').extract())
        match = re.search(r"v\d.+", filename)
        productVersion = match.group() if match else ""
        publish_Time = d.xpath(
            "./div[2]/div/div/div/ul/li[1]/text()").extract().pop()
        publishTime = ""
        try:
            # Dates appear as e.g. "2018年01月05日"; normalise to YYYY-MM-DD.
            array = time.strptime(publish_Time, u"%Y年%m月%d日")
            publishTime = time.strftime("%Y-%m-%d", array)
        except Exception, e:
            print e
        item = MI.FirmcrawlerItem()
        item["firmwareName"] = filename
        item["publishTime"] = publishTime
        item["crawlerTime"] = time.strftime("%Y-%m-%d %H:%M:%S")
        item["url"] = absurl
        item["description"] = desc
        item["productClass"] = ""
        item["productVersion"] = productVersion
        item["productModel"] = productModel
        item["manufacturer"] = "egreat"
        yield item
        print "firmwarename:", item["firmwareName"]
def parse_page(self, response):
    file_stype = response.meta['file_stype']
    productModel = response.meta['productModel']
    tr_list_1 = response.xpath(
        ".//*[@id='page-content']/div[2]/div[3]/table/tbody/tr[position()>1]")
    tr_list_2 = response.xpath(
        ".//*[@id='page-content']/div[2]/div[2]/table/tbody/tr[position()>1]")
    tr_list = tr_list_1 + tr_list_2
    # NOTE: the guard below skips pages that only have the div[2] table; the
    # publish-time XPath inside is specific to the div[3] layout.
    if tr_list_1:
        for tr in tr_list:
            try:
                href = tr.xpath("./td[1]/a/@href").extract().pop()
                if href.endswith('/'):
                    # Directory link: recurse with the same meta.
                    request = scrapy.Request(
                        href,
                        meta={'file_stype': file_stype,
                              'productModel': productModel},
                        callback=self.parse_page)
                    yield request
                else:
                    filename = tr.xpath("./td[1]/a/text()").extract().pop()
                    publish_Time = response.xpath(
                        ".//*[@id='page-content']/div[2]/div[3]/table/tbody/tr[1]/td[3]/text()"
                    ).extract()
                    if publish_Time:
                        publishTime = publish_Time.pop().strip().split()[0]
                    else:
                        publishTime = ""
                    item = MI.FirmcrawlerItem()
                    item["firmwareName"] = filename
                    item["publishTime"] = publishTime
                    item["crawlerTime"] = time.strftime("%Y-%m-%d %H:%M:%S")
                    item["url"] = href
                    item["description"] = ""
                    item["productClass"] = ""
                    item["productVersion"] = ""
                    item["productModel"] = productModel
                    item["manufacturer"] = "koolshare"
                    yield item
                    print "firmwarename:", item["firmwareName"]
            except Exception, e:
                print e.message
def parse(self, response):
    for i in xrange(1, 7 + 1):
        url = "http://www.sundray.com.cn/data/32_page_%s.html" % i
        if i == 1:
            url = "http://www.sundray.com.cn/data/32.html"  # first page has no suffix
        request = scrapy.Request(url, callback=self.parse_list)
        request.meta["prototype"] = MI.FirmcrawlerItem()
        request.meta["prototype"]["manufacturer"] = "sundray"
        yield request
def parse_list(self, response):
    tables = response.xpath('//table[@id="con_two_1"]/tr[position()>1]')
    for t in tables:
        urls = t.xpath('./td[1]/a/@href').extract()
        absurl = urlparse.urljoin(response.url, urls[0])
        request = scrapy.Request(absurl, callback=self.parse_page)
        request.meta["prototype"] = MI.FirmcrawlerItem()
        request.meta["prototype"]["manufacturer"] = "tomax"
        yield request
def parse(self, response):
    for page in xrange(1, 5):
        url_router = ("http://www.wayos.com/download/luyougujian/"
                      + str(page) + ".html")
        request = scrapy.Request(url_router, callback=self.parse_list)
        request.meta["prototype"] = MI.FirmcrawlerItem()
        request.meta["prototype"]["manufacturer"] = "Wayos"
        yield request
    for page in xrange(1, 3):
        url_app = ("http://www.wayos.com/download/APgujian/"
                   + str(page) + ".html")
        request = scrapy.Request(url_app, callback=self.parse_list)
        request.meta["prototype"] = MI.FirmcrawlerItem()
        request.meta["prototype"]["manufacturer"] = "Wayos"
        yield request
def parse_list(self, response):
    lists = response.selector.xpath(
        '//body/table[1]//table//tr[position()>3]')
    for l in lists:
        url = l.xpath('./td[3]//a/@href').extract()
        absurl = urlparse.urljoin(response.url, url[0])
        request = scrapy.Request(absurl, callback=self.parse_page)
        request.meta["prototype"] = MI.FirmcrawlerItem()
        request.meta["prototype"]["manufacturer"] = "draytek"
        yield request
def parse(self, response):
    div_list = response.xpath(
        '//html/body/div[1]/div[2]/div/div[2]/ul/li[2]/div[position()<3]')
    for div_in in div_list:
        for href in div_in.xpath('./div/a/@href').extract():
            request = scrapy.Request(href, callback=self.parse_list)
            request.meta["prototype"] = MI.FirmcrawlerItem()
            request.meta["prototype"]["manufacturer"] = "dahuatech"
            yield request
def parse_page(self, response):
    href_list = response.xpath(
        '//div[@id="right"]/div/div[2]/ul/li/table/tr[position()>1]/td[1]/a/@href'
    ).extract()
    for href in href_list:
        url = urlparse.urljoin(adslrSpider.headurl, href)
        request = scrapy.Request(url, callback=self.parse_list)
        request.meta["prototype"] = MI.FirmcrawlerItem()
        request.meta["prototype"]["manufacturer"] = "adslr"
        yield request
def parse(self, response):
    # Proxy rotation, kept for reference:
    # iprand = random_proxy_ip()
    # print "random proxy:", iprand
    # request = scrapy.Request(response.url, callback=self.parse_page,
    #                          meta={'proxy': 'http://' + iprand})
    request = scrapy.Request(response.url, callback=self.parse_page)
    request.meta["prototype"] = MI.FirmcrawlerItem()
    request.meta["prototype"]["manufacturer"] = "dd-wrt"
    yield request
def parse(self, response):
    for i in range(1, 14):
        # "%23" is a URL-encoded "#" targeting the paged file-list fragment.
        url_list = ("http://www.netcoretec.com/companyfile/2/"
                    "%23c_companyFile_list-15086754191809347-" + str(i))
        request = scrapy.Request(url_list, callback=self.parse_list)
        request.meta["prototype"] = MI.FirmcrawlerItem()
        request.meta["prototype"]["manufacturer"] = "netcore"
        yield request
def parse_page(self, response):
    # Scrapy's .re() flattens the two capture groups, so r holds
    # [href, "date time  size", href, ...] and the loop steps by two.
    r = response.selector.xpath("//pre").re(
        "<a[ ]*href=\"(.*)\".*>.*</a>[ ]*(.*:.*)\r\n")
    i = 0
    prototype = response.meta['prototype']
    while i < len(r):
        if r[i][-1] == "/":
            # Sub-directory: recurse into it.
            request = scrapy.Request(response.url + r[i],
                                     callback=self.parse_page)
            request.meta["prototype"] = response.meta["prototype"]
            yield request
        elif r[i].rsplit(".", 1)[-1].lower() in OpenwrtSpider.suffix:
            item = MI.FirmcrawlerItem(prototype)
            item["crawlerTime"] = time.strftime("%Y-%m-%d %H:%M:%S")
            item["firmwareName"] = r[i]
            item["url"] = response.url + r[i]
            item["productModel"] = ""
            # Derive a model-like string from the firmware name,
            # e.g. PandoraBox-realtek-rtl8198c-alpha-fw.bin.
            divName1 = item["firmwareName"].split("-")
            try:
                if divName1[0] == "PandoraBox":
                    likeModel = divName1[1] + "-" + divName1[2]
                elif divName1[0] == "openwrt":
                    likeModel = divName1[1]
                elif divName1[1] == "openwrt":
                    likeModel = divName1[2]
                else:
                    likeModel = ""
                item["productModel"] = likeModel
            except IndexError:
                pass
            # All full openwrt firmware images here are router images.
            item["productClass"] = "Router"
            try:
                # r[i + 1] is "DD-Mon-YYYY HH:MM  size"; keep the date part.
                item["publishTime"] = r[i + 1].split(" ")[0].strip()
                try:
                    array = time.strptime(item["publishTime"], u"%d-%b-%Y")
                    item["publishTime"] = time.strftime("%Y-%m-%d", array)
                except Exception, e:
                    print e
            except Exception, e:
                print e
            yield item
            print "firmwareName:", item["firmwareName"]
        else:
            # Remember unhandled extensions for later inspection.
            OpenwrtSpider.allsuffix.add(r[i].rsplit(".", 1)[-1].lower())
        i += 2
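# A minimal sketch of what the directory-index regex above captures, run on one
# hand-written Apache-style listing line (the sample line is an assumption, not
# taken from a live index page). Note that plain re.findall returns
# (href, rest) tuples, whereas Scrapy's Selector.re() flattens the groups into
# one list, which is why the loop above strides by two.
sample = '<a href="ar71xx/">ar71xx/</a>          11-Mar-2017 10:22    -\r\n'
pairs = re.findall("<a[ ]*href=\"(.*)\".*>.*</a>[ ]*(.*:.*)\r\n", sample)
# pairs == [('ar71xx/', '11-Mar-2017 10:22    -')]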
def parse(self, response):
    div_list = response.xpath('//div[@class ="view-content"]/div')
    for div_info in div_list:
        # The title link sits in div[2] on most rows and in div[1] on the rest.
        href = div_info.xpath(
            './div/span/div/div/div[1]/div[2]/h2/a/@href').extract()
        if href:
            absurl = href[0]
            filename = absurl.split("/")[-1]
            desc = div_info.xpath(
                './div/span/div/div/div[1]/div[2]/h2/a/text()').extract()[0]
            productModel = desc.split(" ")[0]
            publish_Time = div_info.xpath(
                './div/span/div/div/div[1]/div[3]/p/text()').extract()
            publishTime = publish_Time[0].strip() if publish_Time else ""
        elif div_info.xpath(
                './div/span/div/div/div[1]/div[1]/h2/a/@href').extract():
            href = div_info.xpath(
                './div/span/div/div/div[1]/div[1]/h2/a/@href').extract()
            absurl = href[0]
            filename = absurl.split("/")[-1]
            desc = div_info.xpath(
                './div/span/div/div/div[1]/div[1]/h2/a/text()').extract()[0]
            productModel = desc.split(" ")[0]
            publish_Time = div_info.xpath(
                './div/span/div/div/div[1]/div[2]/p/text()').extract()
            publishTime = publish_Time[0].strip() if publish_Time else ""
        else:
            productModel = ""
            absurl = ""
            desc = ""
            publishTime = ""
            filename = ""
        item = MI.FirmcrawlerItem()
        item["productVersion"] = ""
        item["productClass"] = ""
        item["productModel"] = productModel
        item["description"] = desc
        item["url"] = absurl
        item["firmwareName"] = filename
        item["publishTime"] = publishTime
        item["crawlerTime"] = time.strftime("%Y-%m-%d %H:%M:%S")
        item["manufacturer"] = "u-blox"
        yield item
        print "firmwarename:", item["firmwareName"]
def parse_list(self, response):
    prototype = response.meta['prototype']
    href_list = response.xpath(
        '//div[@id="main-content"]/div[2]/div[2]/div[2]/ul/li/div/section/a/@href'
    ).extract()
    for href in href_list:
        url = urlparse.urljoin(schnerderSpider.headurl, href)
        request = scrapy.Request(url, callback=self.parse_page)
        request.meta["prototype"] = MI.FirmcrawlerItem()
        request.meta["prototype"]["manufacturer"] = "schneider"
        yield request
def parse_page(self, response):
    prototype = response.meta['prototype']
    item = MI.FirmcrawlerItem(prototype)
    filename = response.xpath(
        '//div[@class="technical_support_box_z"]/div/div/text()').extract()
    filename = filename[0] if filename else ""
    publishTime = response.xpath(
        '//div[@class="technical_support_box_z"]/div/ul/li[2]/text()'
    ).extract()[0]
    publishTime = publishTime.strip().split(" ")[0]
    absurl = response.xpath(
        '//div[@class="technical_support_box_z_info_box"]/div[5]/ul/li/a/@href'
    ).extract()
    absurl = absurl[0] if absurl else ""
    desc_li = response.xpath(
        '//div[@class="technical_support_box_z_info_box"]/div[3]/ul/li')
    desc = []
    for desc_info in desc_li:
        desc_ = desc_info.xpath('./font/text()').extract()
        if desc_:
            desc.append(desc_[0])
    desc = " ".join(desc)
    item["productVersion"] = ""
    item["publishTime"] = publishTime
    item["productClass"] = ""
    item["productModel"] = ""
    item["description"] = desc
    item["url"] = absurl
    item["firmwareName"] = filename
    item["crawlerTime"] = time.strftime("%Y-%m-%d %H:%M:%S")
    item["manufacturer"] = "jcg"
    yield item
    print "firmwarename:", item["firmwareName"]
def parse_page(self, response):
    li_list = response.xpath(
        "//html/body/div[1]/div[3]/div[1]/div[2]/div/ul/li")
    for li in li_list:
        filename = li.xpath("./h3/a/text()").extract().pop().strip()
        href = li.xpath("./h3/a/@href").extract().pop()
        absurl = urlparse.urljoin(self.headurl, href)
        description = li.xpath("./p[2]/text()").extract()
        desc = description.pop() if description else ""
        productModel = desc.split(" ")[0]
        publishTime = ""  # default when no date can be parsed from desc
        publish_Time = desc.split(" ")[-1]
        if publish_Time:
            match = re.search(r"\d.+.\d", publish_Time)
            if match:
                publishTime = match.group()
        version_info = re.search(r"V.*", desc)
        if version_info:
            productVersion = version_info.group().split(" ")[0]
        else:
            productVersion = ""
        item = MI.FirmcrawlerItem()
        item["firmwareName"] = filename
        item["publishTime"] = publishTime
        item["crawlerTime"] = time.strftime("%Y-%m-%d %H:%M:%S")
        item["url"] = absurl
        item["description"] = ""
        item["productClass"] = ""
        item["productVersion"] = productVersion
        item["productModel"] = productModel
        item["manufacturer"] = "newgreennet"
        yield item
        print "firmwarename:", item["firmwareName"]
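# A minimal sketch of how these callbacks plug into a spider class. The class
# name, spider name and start URL below are illustrative assumptions; only the
# meta["prototype"] hand-off pattern is taken from the methods above.
class ExampleFirmwareSpider(scrapy.Spider):
    name = "example_firmware"
    start_urls = ["http://www.example.com/download/"]

    def parse(self, response):
        for href in response.xpath('//a/@href').extract():
            request = scrapy.Request(urlparse.urljoin(response.url, href),
                                     callback=self.parse_page)
            # Seed a prototype item; parse_page copies it and fills the rest.
            request.meta["prototype"] = MI.FirmcrawlerItem()
            request.meta["prototype"]["manufacturer"] = "example"
            yield request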