def parser_sub(self, response):
    content = response.body
    response_content = content
    print response.url
    url = response.url
    # Cloudflare-obfuscated email: the address is hidden in a data-cfemail attribute
    cfemail = crawlerTool.getXpath('//a[@class="__cf_email__"]/@data-cfemail', content)
    title = crawlerTool.getXpath('//title/text()', content)[0]
    mail = ''
    if cfemail:
        mail = self.get_mail(cfemail[0])
    data_obj = CmocroItem()
    data_obj['url'] = url
    data_obj['mail'] = mail
    data_obj['name'] = title.replace('- CMOCRO', '').strip()
    yield data_obj

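# A minimal sketch of what get_mail is assumed to do with the data-cfemail
# value (the actual implementation is not shown in this file): Cloudflare's
# email obfuscation encodes the address as a hex string whose first byte is
# an XOR key applied to every following byte.
def decode_cfemail(cfemail):
    key = int(cfemail[:2], 16)                  # first hex byte is the XOR key
    return ''.join(chr(int(cfemail[i:i + 2], 16) ^ key)
                   for i in range(2, len(cfemail), 2))
# usage (hypothetical): mail = decode_cfemail(cfemail[0])
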
def parse1(self, response):
    base_url = get_base_url(response)
    response_content = response.body
    cat_name = response.meta.get('cat_name')
    segs = crawlerTool.getXpath('//div[@class="product_list_left_in"]//li', response_content)
    for seg in segs:
        ChemicalName, CASNumber, MolFormula, SearchImg, Synonyms, url = ['' for i in range(6)]
        SearchImg = crawlerTool.getXpath1('//div[@class="leftSearchImg"]/a/img/@src', seg)
        SearchImg = 'https://www.trc-canada.com' + SearchImg
        contents = crawlerTool.getXpath('//div[@class="ContentDesc"]', seg)
        for content in contents:
            content = content.replace('\r', '').replace('\n', '')
            if 'Chemical Name:' in content:
                ChemicalName = crawlerTool.getRegex('</label>(.*?)<', content).strip()
            elif 'CAS number:' in content:
                CASNumber = crawlerTool.getRegex('</label>(.*?)<', content).strip()
            elif 'Mol. Formula:' in content:
                MolFormula = crawlerTool.getRegex('</label>(.*?)<', content).strip()
            elif 'Synonyms' in content:
                Synonyms = crawlerTool.getRegex('</label>(.*?)<', content).strip()
        data_obj = Trc_Item()
        data_obj['ChemicalName'] = ChemicalName
        data_obj['CASNumber'] = CASNumber
        data_obj['MolFormula'] = MolFormula
        data_obj['SearchImg'] = SearchImg
        data_obj['Synonyms'] = Synonyms
        data_obj['api_name'] = cat_name
        data_obj['url'] = SearchImg  # no product link on the listing; the image src doubles as the key
        yield data_obj

def parse(self, response):
    base_url = get_base_url(response)
    response_content = response.body
    # Some supplier lists are huge, e.g.
    # https://www.chemicalbook.com/ShowSupplierProductsList6187/51100.htm has 90k+ products.
    suburls = crawlerTool.getXpath(
        "//table[@id='ContentPlaceHolder1_ProductClassDetail']//tr/td[1]/a/@href", response_content)
    for suburl in suburls:
        suburl = urljoin(base_url, suburl)
        yield scrapy.Request(url=suburl, callback=self.parser_sub)
    next_page_urls = crawlerTool.getXpath('//div[@align="center"]/a/@href', response_content)
    for page_url in next_page_urls:  # follow every pager link; scrapy's default dupefilter drops repeats
        page_url = urljoin(base_url, page_url)
        yield scrapy.Request(url=page_url, callback=self.parse)

def parse(self, response):
    base_url = get_base_url(response)
    response_content = response.body
    cat_name = response.meta.get('cat_name')
    segs = crawlerTool.getXpath('//doc', response_content, xml_type='XML')
    for seg in segs:
        name = crawlerTool.getXpath1('//str[@name="name"]/text()', seg)
        cas = crawlerTool.getXpath1('//str[@name="casNumber"]/text()', seg)
        function = crawlerTool.getXpath1('//str[@name="tagline"]/text()', seg)
        data_obj = Caymanchem()
        data_obj['name'] = name
        data_obj['cas'] = cas
        data_obj['function'] = function
        data_obj['cat'] = cat_name
        data_obj['url'] = name + cat_name + cas  # no canonical link in the feed; synthesize a unique key
        yield data_obj
    totalnum = int(
        crawlerTool.getXpath1('//result[@name="response"]//@numFound', response_content, xml_type='XML'))
    if not response.meta.get('depth'):  # only the first response fans out the paging requests
        print totalnum
        # each request returns rows=10 results, so step the start offset by 10
        for i in range(10, totalnum, 10):
            url = 'https://www.caymanchem.com/solr/cchProduct/select?facet=true&facet.field=raptas'+\
                '&facet.field=newProduct&facet.limit=100000&fl=isEUSellable%2Cname%2CmarkupName%2CcatalogNum%2CproductImage%2Csynonyms%2CcasNumber%2Ctagline%2Cscore%2CitemGroupId%2CprimaryVendorId&spellcheck=true&spellcheck.collate=true&spellcheck.count=10&spellcheck.extendedResults=true&spellcheck.onlyMorePopular=false&facet.mincount=1&rows=10&version=2.2&json.nl=map&'+\
                'q=*%3A*&start='+str(i)+'&fq=('+cats[cat_name]+')AND(!raptas%3ARAP000101%20AND%20websiteNotSearchable%3Afalse)'
            yield scrapy.Request(url, callback=self.parse, meta={'cat_name': cat_name, 'depth': 1})

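# Sketch of the paging math assumed above (hypothetical helper, not part of
# the original spider): Solr pages with start/rows, so the follow-up offsets
# for numFound results are a stepped range; start=0 was the initial request.
def solr_start_offsets(num_found, rows=10):
    return range(rows, num_found, rows)
# e.g. solr_start_offsets(35) -> [10, 20, 30]
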
class HxChemSpider(scrapy.Spider):
    name = "hxchem"  # unique spider id
    # allowed_domains = ["csdn.net"]
    start_urls = [
        "http://www.hxchem.net/company.php?page=%s" % str(i) for i in range(200, 3160)
    ]

    # def start_requests(self):  # cookie test:
    #     # send the request with a cookie so the site sees us as a logged-in user
    #     yield scrapy.Request(self.start_urls[0], callback=self.parse, cookies={'meng': 1})

    def parse(self, response):
        base_url = get_base_url(response)
        content = response.body
        # Mojibake handling: gbk pages contain stray bytes; drop the byte range
        # the decoder complains about and retry, up to 100 times.
        new_content = content  # fall back to raw bytes if decoding never succeeds
        for i in range(100):
            try:
                new_content = unicode(content, 'gbk')
                break
            except Exception, e:
                if 'position' in str(e):
                    error_str = crawlerTool.getRegex('position\s+(\d+-\d+)', str(e))
                    start_index, end_index = int(error_str.split('-')[0]), int(error_str.split('-')[1]) + 1
                    content = content[:start_index] + content[end_index:]
        response_content = new_content
        suburls = crawlerTool.getXpath("//div[@class='ad_content']//dl/dt/a/@href", response_content)
        if len(suburls) < 10:
            print('num error', response.url)
        for suburl in suburls:
            suburl = urljoin(base_url, suburl)
            yield scrapy.Request(url=suburl, callback=self.parser_sub)

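# The trim-and-retry loop above effectively discards undecodable bytes one
# error at a time. Assuming that lossy behavior is acceptable (which is what
# the loop implies), the codec's 'ignore' error mode does the same in one
# call; a hypothetical drop-in helper:
def decode_lossy(raw, encoding='gbk'):
    return raw.decode(encoding, 'ignore')  # silently skip bytes the codec rejects
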
def parse(self, response):
    base_url = get_base_url(response)
    content = response.body
    manufacturers = crawlerTool.getXpath("//h2/a/@href", content)
    for manufacturer in manufacturers:
        sub_url = 'https://www.parkers.co.uk' + manufacturer + 'specs/'
        yield scrapy.Request(url=sub_url, callback=self.parser_sub)

def parser_spec_url(self, response):
    content = response.body
    url = response.url
    FullSpecs_urls = crawlerTool.getXpath('//h3/a/@href', content)
    for spec_url in FullSpecs_urls:
        if 'http' not in spec_url:  # relative link; prepend the site root
            spec_url = 'https://www.parkers.co.uk' + spec_url
        yield scrapy.Request(url=spec_url, callback=self.parser_detail)

def keyword_search(keyword):
    keywords = urllib.quote(keyword)
    url = 'https://www.youtube.com/results?search_query=' + keywords
    page = ct.get(url)
    # take the first preloaded thumbnail and recover the video id from its src
    imgurl0 = ct.getXpath('//div[@id="img-preload"]/img/@src', page)[0]
    vid = ct.getRegex('i.ytimg.com/vi/(.*?)/', imgurl0)
    video_url = 'https://www.youtube.com/watch?v=' + vid
    print video_url
    return video_url, imgurl0

def parse(self, response):
    base_url = get_base_url(response)
    url_now = response.url
    response_content = response.body
    segs = crawlerTool.getXpath('//div[@class="cas_default_list_star "]//ul', response_content)
    for seg in segs[1:-1]:  # skip the first and last <ul>, which are not product rows
        data_obj = SeekchemItem()
        lis = crawlerTool.getXpath('//li', seg)
        data_obj['url'] = crawlerTool.getXpath1('//a/@href', lis[0])
        data_obj['cas'] = crawlerTool.getXpath1('//b/text()', lis[0])
        data_obj['name'] = crawlerTool.getXpath1('//text()', lis[1])
        yield data_obj
    page_urls = crawlerTool.getXpath('//div[@class="pages"]/a/@href', response_content)
    for page_url in page_urls:
        yield scrapy.Request(urljoin(url_now, page_url), callback=self.parse)

def parse(self, response):
    base_url = get_base_url(response)
    response_content = response.body
    segs = crawlerTool.getXpath('//li[@class="list-group-item"]/text()', response_content)
    for seg in segs:
        data_obj = angenechemical_item()
        data_obj['url'] = seg
        yield data_obj

def parser_sub(self, response):
    base_url = get_base_url(response)
    response_content = response.body
    url = response.url
    data_obj = ChemicalBook()
    data_obj['url'] = url
    data_obj['name'] = crawlerTool.getXpath(
        '//div[@id="ContentPlaceHolder1_SupplierContact"]/table[2]//tr[2]/td[2]/a/text()',
        response_content)[0]
    data_obj['lxdh'] = crawlerTool.getXpath1(
        '//div[@id="ContentPlaceHolder1_SupplierContact"]/table[2]//tr[3]/td[2]//text()',
        response_content)
    data_obj['email'] = crawlerTool.getXpath1(
        '//div[@id="ContentPlaceHolder1_SupplierContact"]/table[2]//tr[5]/td[2]//text()',
        response_content)
    data_obj['wz'] = crawlerTool.getXpath1(
        '//div[@id="ContentPlaceHolder1_SupplierContact"]/table[2]//tr[6]/td[2]//text()',
        response_content)
    cplb_div = crawlerTool.getXpath(
        '//div[@id="ContentPlaceHolder1_ProductSupplier"]//table', response_content)[3:-1]
    print data_obj['name'].encode('unicode-escape').decode('string_escape')
    cplb = []  # product list: "chinese name  product name" pairs
    for cp in cplb_div:
        chinese_name = crawlerTool.getXpath('//tr/td[2]/text()', cp)
        chinese_name = chinese_name[0] if chinese_name else ''
        cps = crawlerTool.getXpath('//tr/td[3]/text()', cp)
        cps = cps[0] if cps else ''
        cplb.append(' '.join([chinese_name, cps]))
    data_obj['cplb'] = cplb
    yield data_obj
    page_urls = crawlerTool.getXpath(
        '//div[@id="ContentPlaceHolder1_ProductSupplier"]//table[2]//tr[2]/td[2]//a/@href',
        response_content)
    for page_url in page_urls:
        page_url = urljoin(base_url, page_url)
        yield scrapy.Request(url=page_url, callback=self.parser_sub)

def parse(self, response):
    base_url = get_base_url(response)
    response_content = response.body
    cat_name = response.meta.get('cat_name')
    segs = crawlerTool.getXpath('//table[@id="product-list"]/tbody/tr', response_content)
    for seg in segs:
        name, MolecularFormula, MolecularWeight, image, cas, url = ['' for i in range(6)]
        SearchImg = crawlerTool.getXpath1(
            '//img[@class="dg-picture-zoom acc_img_container acc_zoomer"]/@src', seg)
        contents = crawlerTool.getXpath('//table//tr', seg)
        for content in contents:
            content = content.replace('\r', '').replace('\n', '')
            if 'Name' in content:
                name = crawlerTool.getXpath1('//td[2]', content)
                name = crawlerTool.getRegex('>(.*?)<', name).strip()
            elif 'CAS No' in content:
                cas = crawlerTool.getXpath1('//td[2]', content)
                cas = crawlerTool.getRegex('>(.*?)<', cas).strip()
            elif 'Molecular Formula' in content:
                MolecularFormula = crawlerTool.getXpath1('//td[2]', content)
                MolecularFormula = re.sub('<.*?>', '', MolecularFormula).strip()
            elif 'Molecular Weight' in content:
                MolecularWeight = crawlerTool.getXpath1('//td[2]', content)
                MolecularWeight = crawlerTool.getRegex('>(.*?)<', MolecularWeight).strip()
        data_obj = acccorporation_Item()
        data_obj['url'] = name  # no detail link on the page; the name doubles as the key
        data_obj['name'] = name
        data_obj['MolecularFormula'] = MolecularFormula
        data_obj['MolecularWeight'] = MolecularWeight
        data_obj['image'] = SearchImg
        data_obj['cas'] = cas
        yield data_obj

def parse(self, response):
    response_content = response.body
    cats = crawlerTool.getXpath('//input[@type="checkbox"]/@value', response_content)
    print len(cats)
    # cats = ['Ether']
    for cat in cats:
        first_str = cat[0].lower()
        # if first_str in ('a', 'b', 'c'): continue  # resume switch for partial runs
        yield scrapy.FormRequest(
            url='https://www.trc-canada.com/parentdrug-listing/',
            formdata={"keyword": " %s " % cat, "t": "product", "advanced": "yes"},
            callback=self.parse1, meta={'cat_name': cat})

def parser_detail(self, response):
    content = response.body
    url = response.url
    data_obj = ParkersItem()
    data_obj['title'] = crawlerTool.getXpath('//title/text()', content)[0]
    data_obj['url'] = url
    # derive name and model from the url path segments
    urlsplit = url.split('/')
    if len(urlsplit) > 4:
        data_obj['name'] = urlsplit[3]
        data_obj['model'] = urlsplit[4]
    data_obj['power'] = crawlerTool.getRegex('Power</th><td>(.*?)</td>', content)
    data_obj['TopSpeed'] = crawlerTool.getRegex('Top Speed</th><td>(.*?)</td>', content)
    data_obj['zerotosixty'] = crawlerTool.getRegex('<th>0-60 mph</th><td>(.*?)</td>', content)
    data_obj['Torque'] = crawlerTool.getRegex('<th>Torque</th><td>(.*?)</td>', content)
    data_obj['co2Emissions'] = crawlerTool.getRegex('<th>CO<sub>2</sub> Emissions</th><td>(.*?)</td>', content)
    data_obj['EuroEmissionsStandard'] = crawlerTool.getRegex('<th>Euro Emissions Standard</th><td>(.*?)</td>', content)
    data_obj['Fuelconsumption'] = crawlerTool.getRegex('<tr><th>Fuel consumption</th><td>(.*?)</td>', content)
    data_obj['Length'] = crawlerTool.getRegex('<tr><th>Length</th><td>(.*?)</td>', content)
    data_obj['Width'] = crawlerTool.getRegex('<tr><th>Width</th><td>(.*?)</td>', content)
    data_obj['Height'] = crawlerTool.getRegex('<tr><th>Height</th><td>(.*?)</td>', content)
    data_obj['EngineSize'] = crawlerTool.getRegex('<tr><th>Engine Size</th><td>(.*?)</td>', content)
    data_obj['Cylinders'] = crawlerTool.getRegex('<tr><th>Cylinders</th><td>(.*?)</td>', content)
    data_obj['FuelType'] = crawlerTool.getRegex('<tr><th>Fuel Type</th><td>(.*?)</td>', content)
    data_obj['Transmission'] = crawlerTool.getRegex('<tr><th>Transmission</th><td>(.*?)</td>', content)
    data_obj['Doors'] = crawlerTool.getRegex('<tr><th>Doors</th><td>(.*?)</td>', content)
    data_obj['Seats'] = crawlerTool.getRegex('<tr><th>Seats</th><td>(.*?)</td>', content)
    data_obj['taxcostBasic'] = crawlerTool.getRegex(
        '<tr><th>Monthly company car tax cost \(Basic Rate\)</th><td>(.*?)</td>',
        content).replace('Â£', '£')  # fix the utf8/latin-1 mojibake of the pound sign
    yield data_obj

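# The block above repeats one getRegex call per spec field. A table-driven
# sketch of the same extraction (hypothetical helper; field names and
# patterns taken from parser_detail above) keeps the spec list in one place:
SPEC_FIELDS = {
    'Length': '<tr><th>Length</th><td>(.*?)</td>',
    'Width': '<tr><th>Width</th><td>(.*?)</td>',
    'Height': '<tr><th>Height</th><td>(.*?)</td>',
}

def extract_specs(content, fields=SPEC_FIELDS):
    return dict((name, crawlerTool.getRegex(pattern, content))
                for name, pattern in fields.items())
# usage (hypothetical): data_obj.update(extract_specs(content))
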
def parse(self, response):
    base_url = get_base_url(response)
    content = response.body
    segs = crawlerTool.getXpath(
        "//table//td[2]//td/table[2]//td//table//tr//td//tr", content)
    for seg in segs[1:]:  # skip the header row
        tds = crawlerTool.getXpath("//td", seg)
        if len(tds) < 4:
            continue
        cat_no, product_name, cas, assay = tds[0], tds[1], tds[2], tds[3]
        rovathin_item = RovathinItem()
        rovathin_item['cat_no'] = re.sub('\s*<.*?>\s*', '', cat_no)
        rovathin_item['product_name'] = re.sub('\s*<.*?>\s*', '', product_name)
        rovathin_item['cas'] = re.sub('\s*<.*?>\s*', '', cas)
        rovathin_item['assay'] = re.sub('\s*<.*?>\s*', '', assay)
        rovathin_item['url'] = crawlerTool.getXpath1("//a/@href", product_name)
        yield rovathin_item

class CmocroSpider(scrapy.Spider):
    name = "cmocro"  # unique spider id
    # allowed_domains = ["csdn.net"]
    start_urls = []
    db_connect = MysqlPipeline3()
    url_cache = []  # in-memory dedup cache
    # seed one search per two-letter prefix: aa, ab, ..., zz
    for i in xrange(26):
        c1 = chr(i + ord('a'))
        for j in xrange(26):
            c2 = chr(j + ord('a'))
            start_urls.append("https://www.cmocro.com/company_search.php?company=%s%s" % (c1, c2))
    # start_urls = start_urls[:1]

    # def start_requests(self):  # cookie test:
    #     # send the request with a cookie so the site sees us as a logged-in user
    #     yield scrapy.Request(self.start_urls[0], callback=self.parse, cookies={'meng': 1})

    def parse(self, response):
        base_url = get_base_url(response)
        content = response.body
        # Mojibake handling: drop the byte span (or single byte) the utf8
        # decoder reports and retry, up to 100 times.
        new_content = content  # fall back to raw bytes if decoding never succeeds
        for i in range(100):
            try:
                new_content = unicode(content, 'utf8')
                break
            except Exception, e:
                if 'position' in str(e):
                    print str(e)
                    error_str = crawlerTool.getRegex('position\s+(\d+-\d+)', str(e))
                    if '-' in error_str:  # a byte range was reported
                        start_index, end_index = int(error_str.split('-')[0]), int(error_str.split('-')[1]) + 1
                        content = content[:start_index] + content[end_index:]
                    else:  # a single offending byte
                        start_index = int(crawlerTool.getRegex('position (\d+)', str(e)))
                        content = content[:start_index] + content[start_index + 1:]
        response_content = new_content
        suburls = crawlerTool.getXpath('//div[@class="company_list"]/a/@href', response_content)
        for suburl in suburls:
            suburl = urljoin(base_url, suburl)
            if not self.db_connect.get_by_unique_value(suburl):  # skip companies already stored
                yield scrapy.Request(url=suburl, callback=self.parser_sub)

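# Equivalent seed-URL generation, sketched as a standalone helper
# (hypothetical; the class above builds the same list with nested loops
# at class scope):
def two_letter_search_urls():
    import itertools, string
    return ["https://www.cmocro.com/company_search.php?company=%s%s" % pair
            for pair in itertools.product(string.ascii_lowercase, repeat=2)]
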
def extractor_page2(page, code):  # parse the detail page
    # top five buyers
    content_table = crawlerTool.getXpath('//table', page, charset='gbk')[0]
    trs = crawlerTool.getXpath('//tr', content_table, charset='gbk')
    rows = [[], [u'股票代码', code]]  # blank spacer row, then the stock-code row
    for tr in trs:
        row = []
        for td in crawlerTool.getXpath('//th', tr, charset='gbk'):
            row.append(re.sub('(<.*?>)', "", td).strip())
        for td in crawlerTool.getXpath('//td', tr, charset='gbk'):
            row.append(re.sub('(<.*?>)', "", td).strip())
        rows.append(row)
    # top five sellers
    content_table = crawlerTool.getXpath('//table', page, charset='gbk')[1]
    trs = crawlerTool.getXpath('//tr', content_table, charset='gbk')
    for tr in trs:
        row = []
        for td in crawlerTool.getXpath('//th', tr, charset='gbk'):
            row.append(re.sub('(<.*?>)', "", td).strip())
        for td in crawlerTool.getXpath('//td', tr, charset='gbk'):
            row.append(re.sub('(<.*?>)', "", td).strip())
        rows.append(row)
    return rows

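# The two table blocks above run the same th/td tag-stripping pass; a sketch
# of the shared helper they could both call (hypothetical, same behavior
# assumed as the inline loops):
def table_rows(table, charset='gbk'):
    rows = []
    for tr in crawlerTool.getXpath('//tr', table, charset=charset):
        row = [re.sub('(<.*?>)', '', cell).strip()
               for tag in ('//th', '//td')
               for cell in crawlerTool.getXpath(tag, tr, charset=charset)]
        rows.append(row)
    return rows
# usage (hypothetical): rows.extend(table_rows(content_table))
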
def parser_sub(self, response):
    content = response.body
    # Mojibake handling: gbk pages contain stray bytes; drop the byte range
    # the decoder complains about and retry, up to 100 times.
    new_content = content  # fall back to raw bytes if decoding never succeeds
    for i in range(100):
        try:
            new_content = unicode(content, 'gbk')
            break
        except Exception, e:
            if 'position' in str(e):
                error_str = crawlerTool.getRegex('position\s+(\d+-\d+)', str(e))
                start_index, end_index = int(error_str.split('-')[0]), int(error_str.split('-')[1]) + 1
                content = content[:start_index] + content[end_index:]
    response_content = new_content
    print response.url
    url = response.url
    # "About Us" section
    gywm = crawlerTool.getXpath("//td[@class='goscill22']/table[2]//p/text()", response_content)
    gywm = ''.join(gywm).replace('\n', '').replace('\r', '')
    # "Contact Us" section
    lxwm = crawlerTool.getXpath("//td[@class='goscill22']/table[4]", response_content)
    lxwm = lxwm[0]
    data_obj = HxchemItem()
    data_obj['url'] = url
    data_obj['gywm'] = gywm
    data_obj['name'] = crawlerTool.getXpath("//h1/text()", response_content)[0]
    data_obj['lxr'] = crawlerTool.getRegex('联系人:(.*?)<', lxwm)   # contact person
    data_obj['dz'] = crawlerTool.getRegex('地 址:(.*?)<', lxwm)    # address
    data_obj['yb'] = crawlerTool.getRegex('邮 编:(.*?)<', lxwm)    # postcode
    data_obj['dh'] = crawlerTool.getRegex('电 话:(.*?)<', lxwm)    # phone
    data_obj['sj'] = crawlerTool.getRegex('手 机:(.*?)<', lxwm)    # mobile
    yield data_obj