def parseAlbumPage(self, response):
    """Parse a single album page and yield an AlbumScraperItem.

    Artist and album names arrive via the request's meta; label, release
    year and the style list are scraped from the page's fixed table layout.
    """
    self._logger.debug("Parsing Album Page at url %s ", response.url)
    artist_name = response.meta['artist_name']
    album_name = response.meta['album_name']
    self._logger.debug("Parsing Album %s for Artist %s", album_name, artist_name)
    hxs = Selector(response)
    album = AlbumScraperItem()
    album['item_type'] = 'Album'
    album['album_name'] = album_name
    album['artist_name'] = artist_name
    # NOTE: absolute position-based XPaths -- brittle, tied to this site's layout.
    album_label = hxs.xpath('/html/body/table[2]/tr/td[2]/table/tr[6]/td[2]/text()').extract()
    if len(album_label) > 0:
        album['album_label'] = album_label[0]
    album_title = hxs.xpath('/html/body/table[2]/tr/td[2]/table/tr[2]/td[2]/text()').extract()
    if len(album_title) > 0:
        # FIX: was a bare `print` statement; route through the spider's
        # logger like every other diagnostic in this method.
        self._logger.debug("Scraped album title: %s", album_title[0])
    released_year = hxs.xpath('/html/body/table[2]/tr/td[2]/table/tr[3]/td[2]/text()').extract()
    if len(released_year) > 0:
        album['album_year'] = released_year[0]
    album_styles = hxs.xpath('/html/body/table[2]/tr/td[2]/table/tr[5]/td[2]/ul/li')
    style_list = []
    for album_style in album_styles:
        style = album_style.xpath('./text()').extract()
        if len(style) > 0:
            style_list.append(style[0])
    album['album_styles'] = ','.join(style_list)
    yield album
def parse_review(self, response):
    """Parse one page of Amazon customer reviews into Review items.

    Follows pagination with another Request while a full page
    (10 reviews) was returned.
    """
    hxs = Selector(response)
    asin = response.meta['asin']
    title = FmtSQLCharater(first_item(hxs.xpath('//title/text()').extract()))
    title = title.replace(u'Amazon.com: Customer Reviews: ', '')
    rlist = hxs.xpath("//div[@id='cm_cr-review_list']/div[@class='a-section review']")
    for div in rlist:
        r = Review()
        r['product_id'] = asin
        r['product_name'] = title
        r['review_id'] = first_item(div.xpath('@id').extract())
        votes = FmtSQLCharater(first_item(div.xpath('div[1]/span/text()').extract()))
        # "X people found this helpful" has a single capture group.
        # BUGFIX: the old code called match.group(2), which raised
        # IndexError for every review that actually had helpful votes.
        # Amazon no longer exposes a separate total-votes figure, so the
        # helpful count fills both fields.
        match = re.search(u'(.+) people found this helpful', votes, re.I)
        if match:
            r['total_feedback_num'] = match.group(1)
            r['total_helpful_num'] = match.group(1)
        # r['full_star'] = FmtSQLCharater(first_item(div.xpath("div[2]/a[1]/i/span/text()").extract()))
        r['title'] = FmtSQLCharater(first_item(div.xpath("div[2]/a[2]/text()").extract()))
        r['cust_name'] = FmtSQLCharater(first_item(div.xpath("div[3]/span[1]/a/text()").extract()))
        r['creation_date'] = FmtSQLCharater(first_item(div.xpath("div[3]/span[4]/text()").extract()))
        r['body'] = first_item(div.xpath("div[5]/span").extract())
        yield r
    # Next page: a full page of 10 reviews suggests more follow.
    if len(rlist) == 10:
        page = response.meta['page'] + 1
        log.msg('Request Product[%s]-[%d] page review ...' % (asin, page))
        yield Request(
            url=self.review_url.replace('<?asin?>', asin).replace('<?page?>', str(page)),
            callback=self.parse_review,
            headers=self.headers,
            meta={'page': page, 'asin': asin}
        )
def parse_item(self, response):
    """Scrape an Ashford product page, then fetch the Chinese-language
    variant of the same page to finish populating the item."""
    self._logger.info('AshfordSpider#parse_item...')
    item = AshfordItem()
    sel = Selector(response)
    self._enrich_base_data(item, response, is_update=False)
    self._enrich_same_part(item, response)

    def joined_text(xpath):
        # Join every match and trim surrounding whitespace.
        return ''.join(sel.xpath(xpath).extract()).strip()

    item['prodName'] = joined_text(' //*[@id="prodName"]/a/text()')
    item['prod_desc'] = joined_text('//*[@id="fstCont"]/h3/text()')
    item['detail'] = format_html_string(joined_text('//div[@id="tab1_info"]'))
    item['Brand'] = joined_text('//h1[@id="prodName"]/a[@id="sameBrandProduct"]/text()[1]')
    image_hrefs = sel.xpath('//a[contains(@href,"/images/catalog/") and contains(@href,"XA.jpg")]/@href').extract()
    item['product_images'] = list(set(image_hrefs))
    item['image_urls'] = [urljoin(response.url, href) for href in item['product_images']]
    # The zh. subdomain serves the same product in Chinese.
    chinese_url = response.url.replace('www.', 'zh.')
    response.meta['item_half'] = item
    self.crawler.stats.inc_crawled_pages(
        crawlid=response.meta['crawlid'],
        spiderid=response.meta['spiderid'],
        appid=response.meta['appid']
    )
    yield Request(
        url=chinese_url,
        meta=response.meta,
        callback=self.parse_chinese_detail,
        dont_filter=True
    )
def load_annotations(body):
    """Create slybot annotations from annotated html."""
    if not body:
        return {"annotations-plugin": {"extracts": []}}
    sel = Selector(text=add_tagids(body))
    existing_ids = set()
    annotations = []
    # Explicit annotations, carried in the data-scrapy-annotate attribute.
    for elem in sel.xpath("//*[@data-scrapy-annotate]"):
        attribs = elem.root.attrib
        annotation = json.loads(unquote(attribs["data-scrapy-annotate"]))
        is_ins_tag = isinstance(elem.root, _Element) and elem.root.tag.lower() == "ins"
        if is_ins_tag:
            # Generated (inserted) elements need extra positional info.
            annotation.update(find_generated_annotation(elem))
        else:
            annotation["tagid"] = attribs.get("data-tagid")
        if "id" not in annotation:
            annotation["id"] = gen_id(disallow=existing_ids)
        existing_ids.add(annotation["id"])
        annotations.append(annotation)
    # Ignore-style annotations: the first matching ignore attribute wins.
    for elem in sel.xpath("//*[@%s]" % "|@".join(IGNORE_ATTRIBUTES)):
        attribs = elem.root.attrib
        for attribute in IGNORE_ATTRIBUTES:
            if attribute in attribs:
                break
        ignore = {attribute[len("data-scrapy-"):]: True}
        if "id" not in ignore:
            ignore["id"] = gen_id(disallow=existing_ids)
        existing_ids.add(ignore["id"])
        annotations.append(ignore)
    return {"annotations-plugin": {"extracts": annotations}}
def stockholder(self, response):
    """Parse the stockholder table and yield one record per shareholder row.

    Rows with an empty stock_name are re-typed as 'Company'.  Each record
    gets a deterministic md5 id built from its identifying fields.
    """
    hxs = Selector(response)
    data = response.meta['data']
    data['type'] = 'StockHolder'
    data['reference'] = response.url
    # First table row holds the reporting period, e.g. "06 / 2015".
    date_str = ''.join(hxs.xpath('//tr[position() =1]/td[1]/text()').extract()).replace(u'(Ultimo Período Informado)', '').strip()
    try:
        date_struct = time.strptime(date_str, '%m / %Y')
        data['stock_date'] = time.strftime('%Y-%m-%dT%H:%M:%SZ', date_struct)
    except ValueError:
        # FIX: was a bare `except:`; only a date-parse failure is expected here.
        self.log('No Information: %s' % response.url)
    stock_list = hxs.xpath('//tr[position() >1]')
    for item in stock_list:
        data['type'] = 'StockHolder'
        data['stock_name'] = ''.join(item.xpath('td[1]/text()').extract()).strip()
        data['stock_percentage'] = ''.join(item.xpath('td[4]/text()').extract()).strip()
        if data['stock_name'] == '':
            data['type'] = 'Company'
        # Deterministic id: md5 over the record's identifying fields.
        m = hashlib.md5()
        to_hash = data['type'] + data['company_ICN'] + data['company_name'] + \
            data['stock_name'] + data['stock_percentage']
        m.update(to_hash.encode('utf-8'))
        data['id'] = m.hexdigest()
        yield data
def xt(cls, response):
    """Extract the creators of a petition (PET) or citizens' initiative (BI).

    Returns a list of (parl_id, name) tuples; parl_id is empty for
    non-parliamentarian creators.  VBG pages have no visible starter,
    so an empty list is possible.
    """
    XPATH_BI_creator = cls.XPATH.format("Erstunterzeichner")
    XPATH_PET_creator = cls.XPATH.format("eine Petition")
    creators = []
    raw_creators_list = response.xpath(XPATH_PET_creator).extract()
    if len(raw_creators_list) > 0:
        # PET started by members of parliament
        for raw_creator in raw_creators_list:
            creator_sel = Selector(text=raw_creator)
            raw_parl_id_url = creator_sel.xpath("//a/@href").extract()
            name = u''
            parl_id = u''
            if len(raw_parl_id_url) > 0:
                raw_parl_id = raw_parl_id_url[0].split("/")
                # BUGFIX: the parliament id lives at index 2, so the href
                # needs at least three path segments; the old `> 1` check
                # allowed an IndexError on two-segment hrefs.
                if len(raw_parl_id) > 2:
                    parl_id = raw_parl_id[2]
            raw_name = creator_sel.xpath("//a/text()").extract()
            if len(raw_name) > 0:
                name = raw_name[0]
            if parl_id != u'' and name != u'':
                creators.append((parl_id, name))
    else:
        raw_creators_list = response.xpath(XPATH_BI_creator).extract()
        if len(raw_creators_list) > 0:
            # BI first signed by a person
            name = _clean(raw_creators_list[0].split("\t")[1])
            creators.append(("", name))
        # VBG seem to have no visible "starter"
    return creators
def parse_business_page(self, response):
    """Extract one YellowpagesItem from a business detail page.

    Missing fields default to ''.  City/state/postal are split from a
    single "City, ST 12345" string.
    """
    hxs = Selector(response)
    contact = hxs.xpath('//*[@id="main-content"]/div[1]/div[1]/div/section[2]/div[1]')
    bNameXPath_list = hxs.xpath('//*[@id="main-content"]/div[1]/div[1]/h1/text()').extract()
    bStreetXPath_list = contact.xpath('./p[@class="street-address"]/text()').extract()
    bCityState_list = contact.xpath('./p[@class="city-state"]/text()').extract()
    bPhone_list = contact.xpath('./p[@class="phone"]/text()').extract()
    businessItem = YellowpagesItem()
    businessItem['Name'] = bNameXPath_list[0] if bNameXPath_list else ''
    businessItem['Street'] = bStreetXPath_list[0] if bStreetXPath_list else ''
    businessItem['Phone'] = bPhone_list[0] if bPhone_list else ''
    if bCityState_list:
        city_state_string = bCityState_list[0]
        # The last two whitespace-separated tokens are state and postal
        # code; everything before them is the (possibly multi-word) city.
        # BUGFIX: the old 3-way unpack of .split() raised ValueError for
        # any city name containing a space.
        parts = city_state_string.split()
        if len(parts) >= 3:
            businessItem['City'] = ' '.join(parts[:-2]).strip(',')
            businessItem['State'] = parts[-2]
            try:
                businessItem['Postal'] = int(parts[-1])
            except ValueError:
                # Non-numeric postal codes are kept verbatim instead of crashing.
                businessItem['Postal'] = parts[-1]
        businessItem['Street'] = businessItem['Street'].strip(',')
    return businessItem
def extract_combine_JCXX(response):
    """Extract the supervision/credential info blocks from a company page
    (e.g. http://00225516.11315.com/).

    Returns every extracted field joined with "\001"; the first field is
    the company id taken from the host part of the URL.
    """
    sel = Selector(text=response.body)
    # xpath_result collects every extracted field; response.url[7:15] is the company id.
    xpath_result = [response.url[7:15]]
    # Certificate links: business licence, organisation code, tax
    # registration, bank account permit, third-party credit certification.
    keywords1 = ['企业法人营业执照', '组织机构代码', '税务登记证', '银行开户许可证', '第三方征信认证']
    xpath_syn1 = [u"//a[text()='%s']/@href" % i for i in keywords1]
    for xp in xpath_syn1:
        tmp = sel.xpath(xp).extract()
        if len(tmp) == 0:
            xpath_result.append("")
        elif len(tmp) == 1 and 'java' not in tmp[0]:
            # BUGFIX: the original wrote `len(tmp) == 1 &(...)`; `&` binds
            # tighter than `==`, so the condition tested
            # `len(tmp) == (1 & bool)` instead of the intended conjunction.
            # Exactly one link that is not a "javascript:void(0);" placeholder.
            xpath_result.append("http://00225516.11315.com" + tmp[0].strip())
        else:
            log.msg("error for_JCXX xpath_syn1 xpath_result=%s" % "\001".join(xpath_result),
                    level=log.ERROR)
    # Per-category info blocks: quality checks, licences, supervision,
    # IP (trademark/patent/copyright), court judgements, enforcement and
    # dishonesty records.
    keywords = ['1-2.质量检查信息', '1-3.行政许可资质',
                '1-4.行政监管信息', '1-5.商标/专利/著作权信息',
                '1-6.人民法院的判决信息', '1-7.人民法院判定的被执行人信息',
                '1-8.人民法院核定的失信被执行人信息']
    xpath_syn = [u"//a[text()='%s']/../../div" % i for i in keywords]
    for xp in xpath_syn:
        raw_re = sel.xpath(xp)
        # First anchor's text holds the total count; absent means no records.
        check_total = raw_re.xpath("./a[1]/text()").extract()
        # More than one anchor means the block is split into sub-categories.
        check_a = raw_re.xpath("./a")
        if len(check_total) == 0:
            # No records at all: emit "0|0|...|"; the trailing "|" keeps
            # the (empty) url slot.
            xpath_result.append("|".join(["0" for _ in xrange(0, len(check_a))]) + "|")
        elif len(check_a) == 1:
            # Only a total: join the count and its absolute url.
            all_JCXX = check_total[0].strip()
            all_JCXX_url = check_a.xpath("./@href").extract()[0]
            all_JCXX_url = "http://00225516.11315.com" + all_JCXX_url
            xpath_result.append(str(all_JCXX) + "|" + all_JCXX_url)
        else:
            # Total plus sub-categories: collect every sub-count.
            info_me = []
            all_JCXX_url = check_a.xpath("./@href").extract()[0]
            all_JCXX_url = "http://00225516.11315.com" + all_JCXX_url
            # FIX: the inner loop variable used to shadow the outer xpath
            # loop's `i`; renamed for clarity.
            for idx in xrange(1, len(check_a) + 1):
                s = "./a[%s]/text()" % str(idx)
                ex_out = raw_re.xpath(s).extract()
                if len(ex_out) == 0:
                    info_me.append("0")  # empty anchor -> count of 0
                else:
                    info_me.append(ex_out[0].strip())
            xpath_result.append("|".join(info_me) + "|" + all_JCXX_url)
    return "\001".join(xpath_result)
def parse_user_page(self, response):
    """Pull the uid, page id and nickname out of the page's inline
    $CONFIG script variables and yield a UserItem."""
    markup = response.body.decode('utf-8')
    sel = Selector(text=markup)
    scripts = sel.xpath('//script')
    # The profile values are assigned inside <script> as $CONFIG['...']='...';
    uid = scripts.re("\$CONFIG\['oid'\]='(\d+?)';")[0]
    pid = scripts.re("\$CONFIG\['page_id'\]='(\d+?)';")[0]
    name = scripts.re("\$CONFIG\['onick'\]='(.*?)';")[0]
    print(uid)
    print(pid)
    print(name)
    yield UserItem(uid=uid, pid=pid, name=name)
def parse_detail(self,response): item = CrawldetailsItem() sel = Selector(response) try: item["kd"] = response.meta['kd'] item["title"] = self.get_text(sel,'//*[@id="job_detail"]/dt/h1/@title') item["company"] = sel.xpath('//*[@id="container"]/div[2]/dl/dt/a/div/h2/text()').extract()[0].strip() item["city"] = sel.xpath('//*[@id="job_detail"]/dd[1]/p[1]/span[2]/text()').extract()[0] item["address"] = sel.xpath('//*[@id="container"]/div[2]/dl/dd/div[1]/text()').extract()[0] industry = sel.xpath('//*[@id="container"]/div[2]/dl/dd/ul[1]/li[1]').extract()[0] item["industry"] = BeautifulSoup(industry).get_text().encode("utf-8").split(' ')[1].strip() scale = sel.xpath('//*[@id="container"]/div[2]/dl/dd/ul[1]/li[2]').extract()[0] item["scale"] = BeautifulSoup(scale).get_text().encode("utf-8").split(' ')[1].strip() phase = sel.xpath('//*[@id="container"]/div[2]/dl/dd/ul[2]/li').extract()[0] item["phase"] = BeautifulSoup(phase).get_text().encode("utf-8").split(' ')[1].strip() item["salary"] = sel.xpath('//*[@id="job_detail"]/dd[1]/p[1]/span[1]/text()').extract()[0] item["experience"] = sel.xpath('//*[@id="job_detail"]/dd[1]/p[1]/span[3]/text()').extract()[0] item["education"] = sel.xpath('//*[@id="job_detail"]/dd[1]/p[1]/span[4]/text()').extract()[0] item["description"] = self.get_text(sel,'//*[@id="job_detail"]/dd[2]') item["url"] = response.url item["published"] = sel.xpath('//*[@id="job_detail"]/dd[1]/p[3]/text()').extract()[0][:-8] item["tag"] = self.get_text(sel, '//*[@id="job_detail"]/dd[1]/p[2]/text()') except Exception, e: print e
def parse(self, response):
    """Collect and whitespace-normalise the page's text blocks; returns an
    (empty) MartinScrapperItem either way."""
    hxs = Selector(response)
    item = MartinScrapperItem()

    def tidy(strings):
        # Collapse whitespace runs and drop entries that end up empty.
        stripped = (re.sub(r'\s+', ' ', s).strip() for s in strings)
        return [s for s in stripped if s]

    try:
        titles = hxs.xpath(
            '//div[@class="wpb_wrapper"]//div[@class="wpb_wrapper"]/text()'
        ).extract()
        print(titles)
        clean_titles = tidy(titles)
        info_list = hxs.xpath(
            '//div[@class="wpb_wrapper"]//div[@class="wpb_wrapper"]/p/text()'
        ).extract()
        clean_info_list = tidy(info_list)
        print(clean_info_list)
    except KeyError:
        self.log('Unable to find title', level=log.WARNING)
    return item
def parse_homepage(self, response):
    """Walk the country lists on the homepage and request each country's
    sights page; "hot" (popular) countries come from a separate list."""
    sel = Selector(response)

    def func(node, hot):
        # Build a sights-page Request for one country link, or None when
        # the country is filtered out by self.param['country'].
        country_url = node.xpath('./@href').extract()[0].strip()
        country_name = node.xpath('./text()').extract()[0].strip()
        ret = node.xpath('./span[@class="en"]/text()').extract()
        country_engname = ret[0].lower().strip() if ret else None
        if 'country' in self.param:
            # BUGFIX: the old filter called country_engname.lower() even
            # when the English name was missing (None), raising
            # AttributeError; a missing name now counts as filtered out.
            if country_engname is None or country_engname not in self.param['country']:
                return None
        sights_url = urlparse.urljoin(country_url, './sight')
        m = {"country_name": country_name, "country_url": country_url,
             "country_popular": hot, "country_engname": country_engname,
             "sights_url": sights_url}
        return Request(url=sights_url, callback=self.parse_countrysights, meta={"country": m})

    for req in map(lambda node: func(node, False), sel.xpath(
            '//div[@id="allcitylist"]/div[contains(@class,"line")]/ul/li/a[@href]')):
        if req is not None:  # skip filtered-out countries
            yield req
    for req in map(lambda node: func(node, True), sel.xpath(
            '//div[@id="allcitylist"]/div[contains(@class,"line")]/ul/li/p[@class="hot"]/a[@href]')):
        if req is not None:
            yield req
def pages(self, response):
    """Extract the page count of a case listing and request every page.

    The current page's content is parsed directly via self.cases();
    further pages are deduplicated through url fingerprints kept in
    self.url_have_seen.
    """
    sel = Selector(text=response.body)
    self.cases(response)  # parse the first page's content in place
    # A non-empty listing container means pagination may exist.
    iscontinue = len(sel.xpath("//div[@id='bottom_right_con_five_xsaj']//ul"))
    if iscontinue:
        try:
            # Total page count is embedded in a createPageHTML(...) script call.
            pages = sel.xpath("//div[@id='bottom_right_con_five_xsaj']//script").re("createPageHTML\(([\d]*?),")[0]
            baseurl = response.url
            for i in range(1, int(pages) + 1):
                fp = self.url_fingerprint(baseurl + "index_" + str(i) + ".htm")
                if fp not in self.url_have_seen:
                    self.url_have_seen.add(fp)
                    yield Request(baseurl + "index_" + str(i) + ".htm",
                                  callback=self.cases, dont_filter=False)
        except Exception as e:
            # FIX: `except Exception, e` is Python-2-only syntax.
            log.msg("only_one url==%s== error=%s" % (response.url, e),
                    level=log.ERROR)
def parse_location(self, response):
    """Follow every geo listing to its attractions page, then follow the
    "next page" pagination link back into this callback."""
    sel = Selector(response)
    print(" **************** LOCATION LIST *************")
    print(response.url)
    print(" **************** LOCATION LIST *************")
    for geo_block in sel.xpath("//ul[@class='geoList']"):
        attraction_hrefs = geo_block.xpath("li/a/@href").extract()
        print(" **************** Attraction List starts *************")
        for href in attraction_hrefs:
            attraction_url = response.urljoin(href)
            print(attraction_url)
            yield scrapy.Request(attraction_url, callback=self.parse_attraction)
        print(" **************** Attraction List ends *************")
    next_hrefs = sel.xpath("//a[@class='guiArw sprite-pageNext pid0']/@href").extract()
    print(" **************** LOCATION LIST PAGINATION starts *************")
    print(next_hrefs)
    print(" **************** LOCATION Link *************")
    for next_href in next_hrefs:
        if next_href:
            next_url = response.urljoin(next_href)
            print(next_url)
            yield scrapy.Request(next_url, callback=self.parse_location)
            print(" **************** LOCATION Link *************")
    print(" **************** LOCATION LIST PAGINATION ends *************")
def stepOne(self, response):
    """Collect the original strings of every untranslated row on this page
    into self.untranslated; return the absolute url of the next page, or
    None on the last page."""
    hxs = Selector(response)
    untranslated_cells = hxs.xpath('//table[@id="translations"]/tr[ contains(@class, "untranslated") ]/td[@class="original"]')
    for cell in untranslated_cells:
        entry = WordpressTranslationHackItem()
        # Concatenate every child node's text, one trailing space per chunk.
        entry['originalString'] = ''.join(
            chunk.strip() + ' ' for chunk in cell.xpath('./child::node()').extract())
        self.untranslated.append(entry)
    next_link = hxs.xpath('//div[@class="paging"]/a[@class="next"]/@href')
    try:
        return response.urljoin(next_link[0].extract())
    except Exception:
        # No "next" link -> this was the last page.
        return None
def load_annotations(body):
    """Create slybot annotations from annotated html."""
    # Empty/None body -> empty plugin payload.
    if not body:
        return {'annotations-plugin': {'extracts': []}}
    sel = Selector(text=add_tagids(body))
    existing_ids = set()
    annotations = []
    # Explicit annotations carried in the data-scrapy-annotate attribute.
    # NOTE(review): elem._root is a private lxml-backed attribute of the
    # selector; newer parsel exposes it publicly as elem.root -- confirm
    # the pinned selector library version before upgrading.
    for elem in sel.xpath('//*[@data-scrapy-annotate]'):
        attributes = elem._root.attrib
        annotation = json.loads(unquote(attributes['data-scrapy-annotate']))
        if (isinstance(elem._root, _Element) and
                elem._root.tag.lower() == 'ins'):
            # Generated (inserted) elements carry extra positional data.
            annotation.update(find_generated_annotation(elem))
        else:
            annotation['tagid'] = attributes.get('data-tagid')
        if 'id' not in annotation:
            annotation['id'] = gen_id(disallow=existing_ids)
        existing_ids.add(annotation['id'])
        annotations.append(annotation)
    # Ignore-style annotations: first matching ignore attribute wins.
    for elem in sel.xpath('//*[@%s]' % '|@'.join(IGNORE_ATTRIBUTES)):
        attributes = elem._root.attrib
        for attribute in IGNORE_ATTRIBUTES:
            if attribute in attributes:
                break
        # Key is the attribute name minus its "data-scrapy-" prefix.
        ignore = {attribute[len('data-scrapy-'):]: True}
        if 'id' not in ignore:
            ignore['id'] = gen_id(disallow=existing_ids)
        existing_ids.add(ignore['id'])
        annotations.append(ignore)
    return {'annotations-plugin': {'extracts': annotations}}
def detail(self, response):
    """Extract a car's key specs and equipment flags from a detail page.

    NOTE(review): the collected keys_info/conf_info lists are neither
    returned nor yielded here -- presumably consumed elsewhere in the
    file; confirm before relying on this method's output.
    """
    sel = Selector(text=response.body)
    condition = sel.xpath(self.xpathSen["brand"]).extract()
    if len(condition) != 0:
        # Key spec fields, looked up by name in self.xpathSen.
        xpath_keys = ["type_auto", "brand", "level", "BSX",
                      "CSJG", "ZWGS", "PL", "RLXS", "QDFS"]
        # Equipment/config flag fields.
        xpath_conf = ["DDTC", "DDTJZY", "ESP", "GPS", "DSXH",
                      "DCLD", "DGLFXP"]
        keys_info = []
        for xpath_str in xpath_keys:
            tmp = sel.xpath(self.xpathSen[xpath_str]).extract()
            try:
                keys_info.append(tmp[0])
            except IndexError as e:
                # FIX: was `except Exception, e` (Python-2-only syntax) and
                # over-broad -- only the empty-result index can fail here.
                # Missing field: keep the slot with an empty value.
                keys_info.append("")
                log.msg("error info=%s keys_info=%s" % (e,
                        "\001".join(keys_info)), level=log.ERROR)
        conf_info = []
        for xpath_s in xpath_conf:
            tmp = sel.xpath(self.xpathSen[xpath_s]).extract()
            try:
                conf_info.append(tmp[0])
            except IndexError as e:
                # Missing flag: record a "-" placeholder.
                conf_info.append("-")
                log.msg("error info=%s conf_info=%s" % (e,
                        "\001".join(conf_info)), level=log.ERROR)
def parse(self, response): zip_file = open('CANADA_ZIPCODES.txt', 'r+') zip_list = filter(None, zip_file.read().split('\n')) for zip_item in zip_list: print "*** zip_item" print zip_item geo_url = 'https://maps.google.com/?q=%s canada'%(zip_item) try: map_url_content = requests.get(geo_url).content except: sleep(15) map_url_content = requests.get(geo_url).content sleep(3) sell = Selector(text=map_url_content) map_error_1 = sell.xpath( '//div[@class="sp-error-msg"]|//div[@class="noprint res"]/div//div[contains(@id,"marker_B")]') latlong = ' '.join(sell.xpath('//script').extract()) if not map_error_1 else '' lat_lng = re.findall(r'",\[(-?\d+\.?\d*),(-?\d+\.?\d*)\]\]', latlong, re.I) venue_latitude, venue_longitude = lat_lng[0] if lat_lng else ('', '') print venue_latitude, venue_longitude if not venue_latitude or not venue_longitude: with open('missing_lat_lng.txt', 'a+') as d: print "*** DROPPED ZIP - %s"%(zip_item) d.write(zip_item+'\n') print "NO LATITUDE OR LONGITUDE" else: fetch_url = 'http://api.invisalign.com/svc/rd?pc=%s&cl=CA&lat=%s&lng=%s&it=us'%(zip_item, venue_latitude, venue_longitude) meta_data = {'venue_latitude': venue_latitude, 'venue_longitude': venue_longitude, 'zip_code': zip_item} yield Request(url = fetch_url, dont_filter=True, callback=self.parse_result, meta=meta_data)
def parse_channel(self, response):
    """Complete the partially-built record carried in meta with the channel
    page's video source url and title, then return it."""
    selector = Selector(response)
    record = response.meta['record']
    video_sources = selector.xpath("body//div[@id='divVideoHolder']/@videosrc").extract()
    titles = selector.xpath("body//div[@id='divTitrGrid']/text()").extract()
    record['video_url'] = video_sources[0]
    record["title"] = titles[0]
    return record
def parse_item(self, response):
    """Parse a product page into a Memory item when the page's breadcrumb
    category is 'Geheugen' (memory)."""
    items = []
    sel = Selector(response)
    print("test1")
    # Product container and spec table for this page.
    products = sel.xpath('//*[@id="coreProductInfos"]/div[2]')
    table = sel.xpath('//tr[contains(td, "techDataCol")]')
    category = sel.xpath('//*[@id="contentWrapper"]/div[1]/span[2]/a/span/text()').extract()
    print(category)
    for product in products:
        if 'Geheugen' in category:
            item = Memory()
            print (table.xpath('//td/text()').extract())
            item['Category'] = category
            # NOTE(review): all of the xpaths below are absolute ('//...'),
            # so they search the whole document rather than the `product`
            # node they are called on -- confirm this is intended, as every
            # product in the loop would receive identical values.
            item['Name'] = product.xpath('//td[contains(td[1], "Modelnaam")]/td[2]/table/tbody/tr/td/text()').extract()
            item['Brand'] = product.xpath('//*[@id="details"]/div[4]/div/table/tbody/tr[2]/td[2]/table/tbody/tr/td/text()').extract()
            item['Quantity'] = product.xpath('//tr[contains(td[1], "Aantal")]/td[2]/text()').extract()
            item['Size'] = product.xpath('//tr[contains(td[1], "Modulegrootte")]/td[2]/text()').extract()
            item['PriceGB'] = product.xpath('//tr[contains(td[1], "Prijs per GB")]/td[2]/text()').extract()
            item['Type'] = product.xpath('//tr[contains(td[1], "Geheugentype")]/td[2]/text()').extract()
            item['Specification'] = product.xpath('//tr[contains(td[1], "Geheugen Specificatie")]/td[2]/text()').extract()
            item['LowVoltage'] = product.xpath('//tr[contains(td[1], "Low Voltage DDR")]/td[2]/text()').extract()
            item['Voltage'] = product.xpath('//tr[contains(td[1], "Spanning")]/td[2]/text()'). extract()
            item['Warranty'] = product.xpath('//tr[contains(td[1], "Fabrieksgarantie")]/td[2]/text()').extract()
            item['Ean'] = product.xpath('//tr[contains(td[1], "EAN")]/td[2]/text()').extract()
            item['Sku'] = product.xpath('//tr[contains(td[1], "SKU")]/td[2]/text()').extract()
            print("Geheugen!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!")
            items.append(item)
    return items
def parse_state_url(self, response):
    """Parse a country page: request every state's city list and yield one
    state-level YahooCityItem."""
    sel = Selector(response)
    # Country name is the trailing word-run of the breadcrumb text.
    tempcountryname = sel.xpath(
        '//div[@id="MediaWeatherRegion"]/div[@class="hd"]/div[@class="yom-bread"]/text()').extract()
    match = re.search(r'[\w\s]+$', tempcountryname[0])
    if match:
        countryname = match.group().strip()
    else:
        self.log('没有国家名', log.WARNING)
        return
    data_1 = response.meta['data']
    for node in sel.xpath('//div[@id="page1"]/ul/li/a'):
        state_name = node.xpath('./span/text()').extract()[0].strip()
        state_href = node.xpath('./@href').extract()[0]
        yield Request(url='https://weather.yahoo.com' + state_href,
                      callback=self.parse_city,
                      meta={'data': {'data_1': data_1, 'countryname': countryname,
                                     'state': state_name}})
    country_code = data_1['countrycode']
    # Get states and provinces
    # NOTE(review): state_name below is the loop variable left over from
    # the last iteration, so only ONE item (for the final state) is
    # yielded -- and this raises NameError if the state list is empty.
    # Presumably the item was meant to be built inside the loop; confirm.
    item = YahooCityItem()
    item['country'] = {'countrycode': country_code, 'countryname': countryname}
    item['state'] = state_name
    item['level'] = 1
    item['abroad'] = data_1['abroad']
    yield item
def parse_job_details(self, response):
    """Build a job item (link, title, first description paragraph) from a
    job-detail page."""
    selector = Selector(response)
    job = BrightermondaySampleItem()
    job['link'] = response.url
    job['title'] = selector.xpath('//h2/text()').extract()[0]
    job['desc'] = selector.xpath('//article[@class="resultDetail"]/p/text()').extract()[0]
    return job
def xt(cls, response):
    """Extract the Rechnungshof presidents: one dict per table row with the
    source link, the reversed name and a presidency mandate."""
    persons = []
    for raw_row in response.xpath(cls.XPATH).extract():
        row = Selector(text=raw_row)
        if row.xpath('//th'):
            # Header row -- skip it.
            continue
        source_link = row.xpath('//td//a/@href').extract()[0]
        # Strip embedded <img> tags before reading the name text.
        reversed_name = _clean(
            Selector(text=remove_tags(raw_row, 'img'))
            .xpath('//td//a/text()').extract()[0])
        pres_start_date, pres_end_date = cls.xt_pres_date(raw_row)
        persons.append({
            'source_link': source_link,
            'reversed_name': reversed_name,
            'mandate': {
                'title': u'RechnungshofpräsidentIn',
                'short': u'RH-PräsidentIn',
                'start_date': pres_start_date,
                'end_date': pres_end_date,
            },
        })
    return persons
def parse(self, response):
    """Parse an episode wiki page and persist it via the Django ORM
    (Episode and Character models), linking characters to the episode."""
    sel = Selector(response)
    data = sel.xpath("//table[@class='infobox']")
    title = sel.xpath("//header[@id='WikiaPageHeader']//h1/text()").extract()[0].strip()
    print "===================NAME======================"
    title = title.replace(" (episode)", "")
    print title
    # Infobox row like "Season 5, episode 12".
    season_episode_str = data.xpath("normalize-space(tr[2]/td[1]/text())").extract()[0]
    season_id = season_episode_str.split("Season ", 1)[1].rpartition(",")[0]
    episode_id = season_episode_str.split("episode ", 1)[1]
    # this title_card is too small.
    title_card = data.xpath("tr[2]/td/div/div/a/@href").extract()[0]
    # this doesn't work for some things. Scraping for this has moved to ep_detail_2
    # production_code = data.xpath("normalize-space(tr[3]/td/text())").extract()[0]
    # Upsert keyed on the cleaned title; side effect: writes to the database.
    e, e_created = Episode.objects.get_or_create(title=title)
    e.season_id = season_id
    e.episode_id = episode_id
    e.link = response.request.url
    e.save()
    # Note for characters. Towards the end, there is /a[1].The [1] is there because I only want the first link.
    # Sometimes something like Hunson Abadeer (name not revealed until "Return to the Nightosphere") will appear.
    # Both Hunson Abadeer and Return ... will be a tags, but Return is obviously not a character.
    characters = sel.xpath("//div[@id='mw-content-text']/*[self::h3 or self::h2][span[@id='Major_characters' or @id='Minor_characters']]/following-sibling::*[1]/li/a[1]/text() | "
                           "//div[@id='mw-content-text']/*[self::h3 or self::h2][span[@id='Major_characters' or @id='Minor_characters']]/following-sibling::*[1]/li/ul/li/a[1]/text()").extract()
    for char in characters:
        # Upsert each character and attach it to the episode (m2m).
        c, c_created = Character.objects.get_or_create(name=char)
        e.characters.add(c)
    print title_card
    print characters
def stepTwo(self, response):
    """Match already-translated rows against the previously collected
    untranslated strings and record their translations; return the next
    page's absolute url, or None on the last page."""
    hxs = Selector(response)
    translated_cells = hxs.xpath('//table[@id="translations"]/tr[ contains(@class, "status-current") ]/td[@class="original"]')
    for cell in translated_cells:
        # Rebuild the original string the same way stepOne did.
        original_text = ''.join(
            chunk.strip() + ' ' for chunk in cell.xpath('./child::node()').extract())
        idx = self.compareStrings(original_text)
        if idx is not None:
            translation = cell.xpath('./..//td[@class="translation foreign-text"]/text()').extract()[0].strip()
            self.untranslated[idx]['translatedString'] = translation
    next_link = hxs.xpath('//div[@class="paging"]/a[@class="next"]/@href')
    try:
        return response.urljoin(next_link[0].extract())
    except Exception:
        # No "next" link -> this was the last page.
        return None
def parse(self, response): selector = Selector(response) posts = selector.xpath('//div[@class="articleh"]') + selector.xpath('//div[@class="articleh odd"]') for index, post in enumerate(posts): item = GubaPostItem() item['stock_id'] = re.search('\d+', response.url).group(0) item['read_count'] = int(post.xpath('span[@class="l1"]/text()').extract()[0]) item['comment_count'] = int(post.xpath('span[@class="l2"]/text()').extract()[0]) item['username'] = post.xpath('span[@class="l4"]/text()').extract() item['updated_time'] = post.xpath('span[@class="l5"]/text()').extract()[0] link = post.xpath('span[@class="l3"]/a/@href').extract() print item['updated_time'] if link: if link[0].startswith('/'): link = "http://guba.eastmoney.com/" + link[0][1:] else: link = "http://guba.eastmoney.com/" + link[0] item['url'] = link yield Request(url=link, meta={'item': item, 'PhantomJS': True}, callback=self.parse_post) for pagenum in xrange(2, 5): url = response.url.split('_') if len(url) == 1: nextpage = url[0][:-5] + '_' + str(pagenum) + '.html' elif len(url) == 2: nextpage = url[0] + '_' + str(pagenum) + '.html' else: break yield Request(url=nextpage, callback=self.parse)
def parse_item(self, response): selector = Selector(response) companyInfo = selector.xpath('//td[@class="cont_company"]//td[@class="td_r"]/text()') jobInfo = selector.xpath('//*[@id="DataList1"]//table/tr') contactInfo = selector.xpath('//td[@class="cont_contact"]') contact_text = contactInfo.xpath('text()').extract()[0] + ' ' + contactInfo.xpath('text()').extract()[1] + ' ' + contactInfo.xpath('text()').extract()[2] #print self.mailre.findall(contact_text) #print self.phonePartern.match(contactInfo.xpath('text()').extract()[0]) #print self.emainPartern(contactInfo.xpath('text()').extract()[1]) #print (contactInfo.xpath('text()').extract()[2]).replace(' ','') for each in jobInfo: item = TsrcwItem() print each.extract() jobList = [] try: for i in each.xpath('td[@class="td-grey"]/text()'): if not (i.extract()).strip() == "": jobList.append((i.extract()).strip()) item['email'] = self.mailre.findall(contact_text)[0] item['companyName'] = (companyInfo.extract()[0]).strip() item['industryName'] = (companyInfo.extract()[1]).strip() item['companyNature'] = (companyInfo.extract()[2]).strip() item['jobName'] = (each.xpath('td[@class="td-grey"]/a/text()').extract()[0]).strip() item['jobDetail'] = self.baseUrl+(each.xpath('td[@class="td-grey"]/a/@href').extract()[0]).strip() item['jobRegion'] = jobList[0] item['requiredDegree'] = jobList[1] item['salary'] = jobList[2] item['endDate'] = jobList[3] yield item except Exception,e: continue
def parse_item_page(self, response):
    """From a product page, walk the review pagination and schedule a parse
    of every review page; fall back to return_invalid_review when the
    product has no reviews.

    NOTE(review): the pagination walk uses blocking requests.get calls
    inside a Scrapy callback, which stalls the reactor -- consider
    converting it into chained scrapy Requests.
    """
    sel = Selector(response)
    item = response.meta['item']
    review_urls = sel.xpath('//a[@class="a-link-emphasis a-text-bold"]/@href').extract()
    self.logger.debug('if this product has review: ' + str(len(review_urls)))
    if review_urls:
        this_review_url = review_urls[0]
        ix = 0
        while True:
            yield Request(url=this_review_url, meta={'item': item},
                          callback=self.parse_review_content_page)
            # BUGFIX: the original rebound `response`, shadowing the
            # callback's own response argument; use a distinct local, and
            # add a timeout so a hung connection can't block the spider.
            page = requests.get(this_review_url, timeout=30)
            page_sel = Selector(text=page.text)
            next_review_urls = page_sel.xpath('//li[@class="a-last"]/a/@href').extract()
            self.logger.debug('next_review_urls :'.join(next_review_urls))
            if not next_review_urls:
                break
            this_review_url = 'http://www.amazon.com' + next_review_urls[0]
            self.logger.debug(this_review_url)
            ix += 1
            self.logger.debug(ix)
    else:
        yield Request(url=response.url, meta={'item': item},
                      callback=self.return_invalid_review)
def parse_item(self, response):
    """Parse a YouTube watch page into one YoutubeVideoItem.

    Likes/dislikes are read from the sentiment-bar tooltip whose text is
    presumably "<likes> / <dislikes>" -- TODO confirm against a live
    page.  Every element with id="body" is treated as one comment block
    and loaded into a nested YoutubeCommentItem under 'comments'.
    """
    video = ItemLoader(item=YoutubeVideoItem(), response=response)
    # Tooltip text of the like/dislike bar.
    info = response.xpath(
        '/html/body/ytd-app/div/ytd-page-manager/ytd-watch-flexy/div[4]/div[1]/div/div[5]/div[2]/ytd-video-primary-info-renderer/div/div/div[3]/ytd-sentiment-bar-renderer/paper-tooltip/div/text()'
    ).get()
    comments = response.xpath('//*[@id="body"]').getall()
    if info is not None:
        info = info.split("/")
    video.add_value('url', '')
    video.add_value(
        'date',
        response.xpath(
            '/html/body/ytd-app/div/ytd-page-manager/ytd-watch-flexy/div[4]/div[1]/div/div[5]/div[2]/ytd-video-primary-info-renderer/div/div/div[1]/div[2]/yt-formatted-string/text()'
        ).get())
    video.add_value(
        'title',
        response.xpath(
            '/html/body/ytd-app/div/ytd-page-manager/ytd-watch-flexy/div[4]/div[1]/div/div[5]/div[2]/ytd-video-primary-info-renderer/div/h1/yt-formatted-string/text()'
        ).get())
    video.add_value(
        'views',
        response.xpath(
            '/html/body/ytd-app/div/ytd-page-manager/ytd-watch-flexy/div[4]/div[1]/div/div[5]/div[2]/ytd-video-primary-info-renderer/div/div/div[1]/div[1]/yt-view-count-renderer/span[1]/text()'
        ).get())
    video.add_value(
        'category',
        response.xpath(
            '//*[@class="content content-line-height-override style-scope ytd-metadata-row-renderer"]/a/text()'
        ).get())
    # Sentiment bar missing: fall back to "0" likes / "0" dislikes.
    if info is None:
        info = []
        for i in range(0, 2):
            info.append("0")
    video.add_value('likes', info[0].strip())
    video.add_value('dislikes', info[1].strip())
    for comment in comments:
        s = Selector(text=comment)
        # NOTE(review): the loader is bound to the whole response, but
        # every value below is pulled from the per-comment selector `s`.
        item = ItemLoader(item=YoutubeCommentItem(), response=response)
        item.add_value('id', s.xpath('//*[@id="author-text"]/@href').get())
        item.add_value(
            'date',
            s.xpath(
                '//*[@class="published-time-text above-comment style-scope ytd-comment-renderer"]/a/text()'
            ).get())
        item.add_value(
            'name',
            self.clean(
                s.xpath('//*[@id="author-text"]/span/text()').get()))
        item.add_value(
            'picture',
            s.xpath(
                '//div[@id="author-thumbnail"]/a/yt-img-shadow/img/@src').
            get())
        item.add_value('content',
                       s.xpath('//*[@id="content-text"]/text()').get())
        item.add_value(
            'likes',
            self.clean(
                s.xpath('//*[@id="vote-count-middle"]/text()').get()))
        video.add_value('comments', item.load_item())
    yield video.load_item()
# encoding: utf-8
from scrapy import Selector

__author__ = 'mtianyan'
__date__ = '2018/1/25 0025 21:26'

import requests

# Fetch one month of daily AQI data for 本溪 (Benxi), 2015-02.
response = requests.get(
    "https://www.aqistudy.cn/historydata/daydata.php?city=%E6%9C%AC%E6%BA%AA&month=201502"
)
# Fixed: scrapy's Selector expects a scrapy Response; a requests.Response
# must be passed as text.  Also renamed `list` -- it shadowed the builtin.
sel = Selector(text=response.text)
# Skip the header <tr>.  NOTE: this site renders the table with
# JavaScript, so a plain HTTP fetch may return no data rows.
rows = sel.xpath("//tr").extract()[1:]
pass  # breakpoint anchor for interactive inspection
def parse_coin_detail_info(self, response):
    """Parse a coin detail page into a Coin item and yield it.

    Extracts current/high/low price, description (fetched synchronously
    from the linked detail URL), market figures, and the key/value
    "base info" list.

    Fixed: comparisons like ``len(x) is not 0`` relied on CPython
    small-int identity caching (SyntaxWarning on 3.8+) and are now plain
    truthiness tests; the always-true ``block_explorers is not []``
    check was removed.
    """
    selector = Selector(response)
    coin = Coin()
    # current price 当前价格数据
    coin_price = selector.xpath('//div[@class="coinprice"]').extract()
    current_price = re.findall(r'<div class="coinprice">(.*?)<span',
                               coin_price[0], re.S)
    if current_price:
        coin['price'] = current_price[0]
        coin['time'] = datetime.utcnow().replace(tzinfo=utc)
        print(coin['price'], ' ', coin['time'])
    # lowest price and highest price 最高和最低价格数据
    low_height = selector.xpath('//div[@class="lowHeight"]').extract()
    prices = re.findall(
        r'<div class="lowHeight">.*?<span class="value">(.*?)</span></div>.*?<div>.*?<span class="value">(.*?)</span></div>',
        low_height[0], re.S)
    if prices:
        coin['highest_price'] = prices[0][0]
        coin['lowest_price'] = prices[0][1]
        print(coin['highest_price'], ' ', coin['lowest_price'])
    # description 币的描述数据
    desc = selector.xpath('//div[@class="des"]/a').extract()
    description = re.findall(r'<a href="(.*?)" target="_blank">', desc[0],
                             re.S)
    if description:
        desc_url = base_url + description[0]
        print(desc_url)
        # NOTE(review): blocking fetch inside a Scrapy callback; consider
        # yielding a Request instead.
        response = requests.get(desc_url)
        desc_selector = Selector(response)
        desc_content = desc_selector.xpath(
            '//div[@class="boxContain"]/div/p').extract()
        coin['description'] = self.tool.replace(''.join(
            i.strip() for i in desc_content))
        print(coin['description'])
    # market 市场相关信息
    market = selector.xpath(
        '//div[@id="baseInfo"]/div[@class="firstPart"]/div/div[@class="value"]'
    ).extract()
    values = []
    for value in market:
        market_value = re.findall(r'<div class="value">(.*?)<', value, re.S)
        values.append(market_value[0])
    if values:
        coin['market_capitalization'] = values[0]  # 流通市值
        coin['market_count'] = values[1]  # 流通量
        coin['publish_count'] = values[2]  # 发行量
        coin['tx_count'] = values[3]  # 交易额
        print(coin['market_capitalization'], ' ', coin['market_count'], ' ',
              coin['publish_count'], ' ', coin['tx_count'])
    # base info 列表基本信息数据
    items = selector.xpath(
        '//div[@id="baseInfo"]/div[@class="secondPark"]/ul/li').extract()
    for item in items:
        base_info = re.findall(
            r'<li>.*?<span class="tit">(.*?)</span>.*?<span class="value">(.*?)</span>.*?</li>',
            item, re.S)
        if not base_info:
            continue
        tit, value = base_info[0]
        if tit == '英文名:':
            coin['english_name'] = self.tool.replace(value).strip()
            print(coin['english_name'])
        elif tit == '中文名:':
            coin['chinese_name'] = self.tool.replace(value).strip()
            print(coin['chinese_name'])
        elif tit == '上架交易所:':
            coin['exchanger_count'] = self.tool.replace(value).strip()
            print(coin['exchanger_count'])
        elif tit == '发行时间:':
            coin['publish_time'] = self.tool.replace(value).strip()
            print(coin['publish_time'])
        elif tit == '白皮书:':
            coin['white_paper'] = self.tool.replace(value).strip()
            print(coin['white_paper'])
        elif tit == '网站:':
            websites = re.findall(
                r'<a href="(.*?)" rel="nofollow" target="_blank">', value,
                re.S)
            if websites:
                coin['website'] = [
                    self.tool.replace(w).strip() for w in websites
                ]
                print(coin['website'])
        elif tit == '区块站:':
            block_explorers = re.findall(
                r'<a href="(.*?)" rel="nofollow" target="_blank">', value,
                re.S)
            # Empty list simply yields no explorers.
            coin['block_explorer'] = [
                self.tool.replace(b).strip() for b in block_explorers
            ]
            print(coin['block_explorer'])
        elif tit == '是否代币:':
            coin['is_token'] = self.tool.replace(value).strip()
            print(coin['is_token'])
        elif tit == '众筹价格:':
            ico_price = re.findall(r'<a href="#ico">(.*?)</a>', value, re.S)
            coin['ico_price'] = self.tool.replace(ico_price[0]).strip()
            print(coin['ico_price'])
    yield coin
def download():
    """Scrape the "Throws:" sections of cached JDK class doc pages and
    store each declared exception in the jdk_exception table.

    Fixed: the method-lookup SELECT was built by string concatenation --
    declarations contain quotes, so it both broke and was injectable; it
    now uses DB-API parameters like the INSERT already did.  Py2-only
    ``except Exception, e`` syntax replaced with ``as e``.
    """
    try:
        cur.execute(
            "select class_id, doc_website from jdk_class where class_id <= 4240"
        )
        lists = cur.fetchall()
        for every in lists:
            print(every[0])
            # Retry the page download until it succeeds.
            while True:
                try:
                    sel = Selector(requests.get(every[1], timeout=10))
                except Exception:
                    print('timeout')
                    continue
                break
            block_list = sel.xpath('//div[@class="details"]/ul/li/ul')
            for block in block_list:
                details = block.xpath('li/ul')
                for each in details:
                    full_declaration = each.xpath('li/pre').extract()[0]
                    method_name = each.xpath('li/h4/text()').extract()[0]
                    # Parameterized lookup of the method row.
                    cur.execute(
                        "select method_id from jdk_method where"
                        " full_declaration = %s and name = %s and class_id = %s",
                        (full_declaration, method_name, every[0]))
                    method_id = cur.fetchall()[0][0]
                    if each.xpath('li/dl/dt/span[@class="throwsLabel"]'):
                        if each.xpath(
                                'li/dl/dt/span[@class="throwsLabel"]/text()'
                        ).extract()[0] == "Throws:":
                            exception_count = 0
                            exceptions = []
                            following_tags_dd = each.xpath(
                                'li/dl/dt/span[@class="throwsLabel"]/parent::*/following-sibling::dd'
                            )
                            if each.xpath(
                                    'li/dl/dt/span[@class="throwsLabel"]/parent::*/following-sibling::dt'
                            ):
                                # A later <dt> follows: keep only the <dd>
                                # elements between "Throws:" and that <dt>.
                                following_tags_dt = each.xpath(
                                    'li/dl/dt/span[@class="throwsLabel"]/parent::*/following-sibling::dt'
                                )
                                next_dt = following_tags_dt[0]
                                preceding_tags_dd = next_dt.xpath(
                                    'preceding-sibling::dd')
                                set_following_tags_dd = set(
                                    list(following_tags_dd.extract()))
                                set_preceding_tags_dd = set(
                                    list(preceding_tags_dd.extract()))
                                exceptions = list(set_following_tags_dd
                                                  & set_preceding_tags_dd)
                                exception_count = len(exceptions)
                            else:
                                exception_count = len(following_tags_dd)
                                exceptions = following_tags_dd.extract()
                            print(exception_count)
                            print(exceptions)
                            for ex in exceptions:
                                # Exception class name sits before </code>.
                                temp_str = ex[:ex.find("</code>")]
                                temp_str = temp_str.replace("<dd>", "").replace(
                                    "<code>", "").replace("</a>", "")
                                exception_class = temp_str[temp_str.find(">") + 1:]
                                print(exception_class)
                                # Free-text description follows " - ".
                                description = ex[ex.find("</code>") + 9:].replace(
                                    "</dd>", "").replace("\n", "").replace(
                                        " ", "").strip()
                                if description == "dd>":
                                    description = ''
                                print(description)
                                cur.execute(
                                    "insert into jdk_exception(name, class_id, method_id, description) values(%s, %s, %s, %s)",
                                    (exception_class, every[0], method_id,
                                     description))
                                conn.commit()
    except Exception as e:
        print('Exception:', e)
def parse(html, source_url=u''): response = Selector(text=html) # 处理内容区 content_html = response.xpath(u'//div[@class="lph-article-comView"]') if not content_html: return # 去除内部不需要的标签 content_items = content_html.xpath(u'*[not(name(.)="script") ' u' and not(name(.)="style")' u' and not(name(.)="a")' u' and not(name(.)="iframe")]|text()') if not content_items: return # 处理时间 post_date = response.xpath(u'//td[@class="time"]/text()').extract_first( u'').strip() # 处理作者 post_user = response.xpath(u'//a[@rel="nofollow"]/text()').extract_first( u'') # 处理来源 src_ref = u'雷锋网' # 组装新的内容标签 content_html = u"""<div class="lphArticle-detail"> <div class="lph-article-comView"> %s </div> </div> """ % (u''.join(content_items.extract()), ) content_html = content_html.replace(u'https://static.leiphone.com/uploads/new/category/pic/201801/5a5dd347356f7' u'.jpg?imageMogr2/thumbnail/!740x140r/gravity/Center/crop/740x140/quality/90' u'', u'')\ .replace(u'雷锋网原创文章,未经授权禁止转载。详情见。', '')\ .replace(u'雷锋网原创文章,未经授权禁止转载。详情见', '')\ .replace(u'<a href="http://dwz.cn/4ErMxZ" rel="nofollow" target="_blank">转载须知</a>。', u'') \ .replace(u'转载须知。', u'') \ .replace(u'转载须知', u'') \ .replace(u'雷锋网版权文章,未经授权禁止转载。详情见。', u'')\ .replace(u'雷锋网版权文章,未经授权禁止转载。详情见', u'') # 去除不要的标签内容 clear_paths_in = [] style_in_list = [] style_need_replace = [] title = response.xpath( u'//meta[@property="og:title"]/@content | //title/text()' ).extract_first(u'') content_item = { u'title': title, u'content_html': content_html, u'post_date': post_date, u'style_in_list': style_in_list, u'style_need_replace': style_need_replace, u'clear_paths_in': clear_paths_in } return content_item
def parse(self, response):
    """Collect the race-date option values from the date <select> box.

    Returns one HorseRacingItem whose 'date_pages' field holds every
    option value of the #raceDateSelect element (last match wins if the
    page unexpectedly contains several).
    """
    selector = Selector(response)
    item = HorseRacingItem()
    date_selects = selector.xpath('//select[@id="raceDateSelect"]')
    for select_node in date_selects:
        item['date_pages'] = select_node.xpath('.//option/@value').extract()
    return item
def parse_house_info(self, resp):
    """解析二手房信息: scrape one second-hand-house detail page, persist the
    record via ``self.db.update_set`` and log the source URL.
    """
    response = Selector(resp)
    # Label-based lookup: the sibling text node next to a <span> label.
    label_xpath = "//span[text()='{}']/../text()"

    def text_at(xp):
        # Shared extract_first('') + strip() used by all plain-text fields.
        return response.xpath(xp).extract_first("").strip()

    item = dict()
    # 链家编号
    item['houseCode'] = text_at("//div[@class='houseRecord']/span[2]/text()")
    # 小区名
    item['houseName'] = text_at("//div[@class='communityName']/a[1]/text()")
    # 朝向
    item['houseDirection'] = text_at(label_xpath.format("房屋朝向"))
    # 户型
    item['houseType'] = text_at(label_xpath.format("房屋户型"))
    # 电梯
    item['houseElevator'] = text_at(label_xpath.format("配备电梯"))
    # 区域
    item['houseAddress'] = text_at("//div[@class='areaName']/a/text()")
    item['houseDistrict'] = text_at(
        "//div[@class='areaName']/span[@class='info']/a[2]/text()")
    item['houseRegion'] = text_at(
        "//div[@class='areaName']/span[@class='info']/a[1]/text()")
    # 楼层
    item['houseFloor'] = text_at(label_xpath.format("所在楼层"))
    # 建筑面积
    item['houseSize'] = text_at(label_xpath.format("建筑面积"))
    # 装修情况
    item['houseStatus'] = text_at(label_xpath.format("装修情况"))
    # 每平米价格
    item['houseUnitPrice'] = text_at("//span[@class='unitPriceValue']/text()")
    # 总价
    item['houseAllPrice'] = text_at(
        "//div[@class='price ']/span[@class='total']/text()")
    # 建设时间 (first run of digits in the subInfo line)
    item['houseYear'] = response.xpath(
        "//div[@class='area']/div[@class='subInfo']/text()").re_first(
            r"(\d+)")
    # 原文链接
    item['url'] = resp.url
    # 经纬度: pulled out of an inline script via the precompiled pattern.
    postions = self.pattern_position.search(resp.text)
    item['Longitude'] = postions.group(1)
    item['Latitude'] = postions.group(2)
    self.db.update_set('houseCode', item)
    self.lianjia_spider_log.info(f'parse item success:{resp.url}')
</tr> </table> </div> </body> </html> ''' #xpath定位一个tap并不是唯一的,可以用很多xpath定位一个tap #获取一个html的xpath sel = Selector(text=html) #xpath是让整个tap都是可配置的 #因为tap可能会随时变,这样通过改变量,方便管理 age_name_xpath = "//div[1]/div/p[1]/text()" age_name_tap = sel.xpath(age_name_xpath).extract() #加个if是防止报错 if age_name_tap: name = age_name_xpath[0] #使用class时,需要把全部的class都包含了才行 teacher_tap = sel.xpath( "//div[@class='teacher_info info']/p/text()").extract()[0] #可以使用contains方法 teacher_tap = sel.xpath( "//div[contains(@class,'teacher_info')]/p/text()").extract()[0] #找到class元素,用@class teacher_class = sel.xpath( "//div[contains(@class,'teacher_info')]/@class").extract()[0]
def parse(self, response):
    """Parse one page of a Twitter timeline JSON response, yield one
    SpiderTwitterItem per unseen tweet, then request the next page.

    Stops (and updates the seed bookmark) when the empty-page sentinel is
    returned or when a tweet already recorded as the seed location is
    reached.
    """
    sites = json.loads(response.text)
    spider_name = response.meta['spider_name']
    # Raw HTML of the tweet stream.
    data = sites["items_html"]
    min_position = sites["min_position"]
    # Cursor for the next page: max_position when present, otherwise the
    # id embedded in min_position.
    position = ''
    if 'max_position' in sites:
        position = sites["max_position"]
    else:
        position = min_position.split('-')[2]
    # Twitter returns this whitespace-only blob when there are no more
    # tweets -- mark the seed finished and record the cursor.
    if data == "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n \n":
        print ("抓取完成!!!,更新种子")
        self.db.updateSeedTag(spider_name)
        self.db.updateSeedCountLocation(spider_name, position)
    else:
        # has_more_items = sites["has_more_items"]  (whether a next page exists)
        item = SpiderTwitterItem()
        # Author of the posts (same for the whole timeline page).
        twitter_author = re.compile('data-name="(.+)" data-user-id=').findall(data)[0]
        selector_app = Selector(text=data)
        twitter_group = selector_app.xpath("//li[@class='js-stream-item stream-item stream-item\n']").extract()
        twitter_group_count = len(twitter_group)
        next_page_id = ""
        for twitter_personal in twitter_group:
            selector_content = Selector(text=twitter_personal)
            twitter_id = selector_content.xpath("//li[@class='js-stream-item stream-item stream-item\n']/@data-item-id").extract()
            if len(twitter_id) > 0:
                next_page_id = twitter_id[0]
                if self.db.getTwitterById(next_page_id):
                    # Reached the position recorded on a previous crawl?
                    if self.db.isSeedLocation(spider_name, next_page_id):
                        print ("%s最新推文抓取完毕"%spider_name)
                        self.db.updateSeedCountLocation(spider_name, position)
                        return
                    # Already stored: dedup-skip this tweet.
                    print ("%s已存在,进行去重过滤"%next_page_id)
                    continue
                else:
                    item['twitter_id'] = twitter_id
            else:
                item['twitter_id'] = ''
            # Concatenate all text nodes of the tweet body, dropping
            # whitespace-only fragments and embedded newlines.
            twitter_content_whole = ""
            twitter_content_list = selector_content.xpath("//div[@class='js-tweet-text-container']").extract()
            for twitter_content in twitter_content_list:
                selector_content_text = Selector(text=twitter_content)
                twitter_content_text = selector_content_text.xpath("//text()").extract()
                twitter_content_text_num = len(twitter_content_text)
                for i in range(twitter_content_text_num):
                    if twitter_content_text[i] != " " and twitter_content_text[i] != "\n ":
                        twitter_content_add = twitter_content_text[i].replace("\n","")
                        twitter_content_whole += twitter_content_add
            # Escape double quotes for downstream storage.
            twitter_content_whole_trun = twitter_content_whole.replace('"','\\"')
            twitter_href = selector_content.xpath("//small[@class='time']/a/@href").extract()
            twitter_time = selector_content.xpath("//small[@class='time']/a/@title").extract()
            # Action counters appear in the order: replies, retweets, likes.
            twitter_num = selector_content.xpath("//span[@class='ProfileTweet-actionCountForAria']/text()").extract()
            if len(twitter_num) > 0:
                twitter_reply = twitter_num[0]
                twitter_trunsmit = twitter_num[1]
                twitter_zan = twitter_num[2]
            else:
                twitter_reply = ''
                twitter_trunsmit = ''
                twitter_zan = ''
            twitter_img = selector_content.xpath("//div[@class='AdaptiveMedia-photoContainer js-adaptive-photo ']/@data-image-url").extract()
            print ("目标:%s" % twitter_id[0])
            print ("内容:%s" % twitter_content_whole_trun)
            if len(twitter_author) > 0:
                author = twitter_author
                item['twitter_author'] = author
            else:
                item['twitter_author'] = ''
            if len(twitter_id) > 0:
                tw_id = twitter_id[0]
                item['twitter_id'] = tw_id
            else:
                item['twitter_id'] = ''
            if twitter_content_whole:
                content = twitter_content_whole_trun
                item['twitter_content'] = content
            else:
                item['twitter_content'] = ''
            if len(twitter_href) > 0:
                href = "https://twitter.com%s"%twitter_href[0]
                item['twitter_href'] = href
            else:
                item['twitter_href'] = ''
            if len(twitter_time) > 0:
                time = twitter_time[0]
                item['twitter_time'] = time
            else:
                item['twitter_time'] = ''
            if len(twitter_num) > 0:
                reply = twitter_reply
                item['twitter_reply'] = reply
            else:
                item['twitter_reply'] = ''
            if len(twitter_num) > 0:
                trunsmit = twitter_trunsmit
                item['twitter_trunsmit'] = trunsmit
            else:
                item['twitter_trunsmit'] = ''
            if len(twitter_num) > 0:
                zan = twitter_zan
                item['twitter_zan'] = zan
            else:
                item['twitter_zan'] = ''
            # Single image -> string; several images -> list; none -> ''.
            if len(twitter_img) == 1:
                img = twitter_img[0]
                item['twitter_img'] = img
            elif len(twitter_img) > 1:
                img_list = []
                for img in twitter_img:
                    img_list.append(img)
                item['twitter_img'] = img_list
            else:
                item['twitter_img'] = ''
            yield item
        print ("下一页等待中...")
        # has_more_items == true means a next page exists.
        yield Request(url=self.next_page_url.format(spider_name,self.now_time, next_page_id, position),
                      callback=self.parse,headers={'Referer': "https://twitter.com/"},
                      meta={'spider_name': spider_name})
def parse(self, response):
    """Selenium-driven crawl of paginated Kickstarter-style project lists.

    For every row of ``self.df`` (URL + location/category metadata) it
    pages through listing URLs, opens each unseen project in a second
    browser window, opens the "About the creator" modal, and yields one
    dict per project.  Uses two windows: handle[0] for listings,
    handle[1] for project detail pages.
    """
    driver = response.meta['driver']
    driver.maximize_window()
    # Open a second tab/window for detail pages.
    driver.execute_script("window.open('');")
    driver.switch_to.window(driver.window_handles[0])
    for _, value in self.df.iterrows():
        # NOTE(review): page counter starts at 199 and the loop breaks
        # past 200, so only pages 199-200 are visited -- presumably a
        # resume/debug setting; confirm before production use.
        cntr = 199
        while True:
            location = value['Location']
            category = value['Category']
            subCat = value['Subcategory']
            url = f"{value['URL']}{cntr}"
            driver.get(url)
            cntr += 1
            # Wait for the project cards to render.
            WebDriverWait(driver, 15).until(
                EC.visibility_of_element_located((
                    By.XPATH,
                    "(//div[@class='js-project-group'])[2]//h3/parent::a[@class='soft-black mb3']"
                )))
            html = driver.page_source
            respObj = Selector(text=html)
            # Total project count, digits only (e.g. "1,234 projects").
            count = respObj.xpath(
                "normalize-space(//b[contains(@class, 'count')]/text())"
            ).get()
            pCount = int("".join(re.findall(r'\d+', count)))
            # Detail pages are loaded in the second window.
            driver.switch_to.window(driver.window_handles[1])
            items = respObj.xpath(
                "(//div[@class='js-project-group'])[2]//h3/parent::a[@class='soft-black mb3']"
            )
            for item in items:
                title = item.xpath("normalize-space(.//h3/text())").get()
                # self.li is the list of titles already scraped (dedup).
                if title not in self.li:
                    self.li.append(title)
                    url = item.xpath(".//@href").get()
                    driver.get(url)
                    time.sleep(1)
                    WebDriverWait(driver, 15).until(
                        EC.visibility_of_element_located((
                            By.XPATH,
                            "//a[@data-modal-title='About the creator']")))
                    html1 = driver.page_source
                    respObj1 = Selector(text=html1)
                    title = respObj1.xpath(
                        "normalize-space(//h2/span/a/text())").get()
                    creator = respObj1.xpath(
                        "normalize-space(//a[@data-modal-title='About the creator']/text())"
                    ).get()
                    backers = respObj1.xpath(
                        "normalize-space(//b[contains(text(), 'backers')]/text())"
                    ).get()
                    money = respObj1.xpath(
                        "normalize-space(//span[@class='money']/text())"
                    ).get()
                    # The creator's websites live inside a modal dialog.
                    driver.find_element_by_xpath(
                        "//a[@data-modal-title='About the creator']"
                    ).click()
                    time.sleep(2)
                    html2 = driver.page_source
                    respObj2 = Selector(text=html2)
                    yield {
                        'Title': title,
                        'Creator': creator,
                        'Backers': backers.replace(" backers", ""),
                        'Money': money,
                        'Website':
                        respObj2.xpath(
                            "//h4[contains(text(), 'Websites')]/following-sibling::ul/li/a/@href"
                        ).getall(),
                        'Location': location,
                        'Category': category,
                        'Sub Category': subCat
                    }
                else:
                    pass
            # Back to the listing window for the next page.
            driver.switch_to.window(driver.window_handles[0])
            # Number of listing pages at 12 projects per page.
            a = pCount // 12
            if pCount % 12 != 0:
                a += 1
            else:
                a += 0
            if cntr > 200:
                break
def parse(self, response):
    """Parse a weibo.cn user info page into a UserItem, then request the
    user's profile page for the remaining fields.

    All fields are regex-matched against the semicolon-joined text of the
    page's ``div.c`` blocks; ``\\xa0`` (non-breaking space) artifacts are
    stripped from every value.
    """
    user_item = UserItem()
    user_item['crawl_time'] = int(time.time())
    selector = Selector(response)
    # User id is embedded in the ".../<id>/info" URL.
    user_item['_id'] = re.findall('(\d+)/info', response.url)[0]
    user_info_text = ";".join(
        selector.xpath('body/div[@class="c"]//text()').extract())
    # Each label is "名称:value;" within the joined text.
    nick_name = re.findall('昵称;?:?(.*?);', user_info_text)
    gender = re.findall('性别;?:?(.*?);', user_info_text)
    place = re.findall('地区;?:?(.*?);', user_info_text)
    brief_introduction = re.findall('简介;?:?(.*?);', user_info_text)
    birthday = re.findall('生日;?:?(.*?);', user_info_text)
    sex_orientation = re.findall('性取向;?:?(.*?);', user_info_text)
    sentiment = re.findall('感情状况;?:?(.*?);', user_info_text)
    vip_level = re.findall('会员等级;?:?(.*?);', user_info_text)
    authentication = re.findall('认证;?:?(.*?);', user_info_text)
    labels = re.findall('标签;?:?(.*?)更多>>', user_info_text)
    if nick_name and nick_name[0]:
        user_item["nick_name"] = nick_name[0].replace(u"\xa0", "")
    if gender and gender[0]:
        user_item["gender"] = gender[0].replace(u"\xa0", "")
    if place and place[0]:
        # "province city" -- city is optional.
        place = place[0].replace(u"\xa0", "").split(" ")
        user_item["province"] = place[0]
        if len(place) > 1:
            user_item["city"] = place[1]
    if brief_introduction and brief_introduction[0]:
        user_item["brief_introduction"] = brief_introduction[0].replace(
            u"\xa0", "")
    if birthday and birthday[0]:
        user_item['birthday'] = birthday[0]
    if sex_orientation and sex_orientation[0]:
        # Same as own gender -> homosexual, otherwise heterosexual.
        if sex_orientation[0].replace(u"\xa0", "") == gender[0]:
            user_item["sex_orientation"] = "同性恋"
        else:
            user_item["sex_orientation"] = "异性恋"
    if sentiment and sentiment[0]:
        user_item["sentiment"] = sentiment[0].replace(u"\xa0", "")
    if vip_level and vip_level[0]:
        user_item["vip_level"] = vip_level[0].replace(u"\xa0", "")
    if authentication and authentication[0]:
        user_item["authentication"] = authentication[0].replace(
            u"\xa0", "")
    if labels and labels[0]:
        user_item["labels"] = labels[0].replace(u"\xa0",
                                                ",").replace(';',
                                                             '').strip(',')
    # Education / work history: the div following the section heading.
    education_info = selector.xpath('//div[contains(text(),"学习经历")]/following-sibling::div[1]'). \
        xpath('string(.)').extract()
    if education_info:
        user_item['education'] = education_info[0].replace(u"\xa0", "")
    work_info = selector.xpath('//div[contains(text(),"工作经历")]/following-sibling::div[1]'). \
        xpath('string(.)').extract()
    if work_info:
        user_item['work'] = work_info[0].replace(u"\xa0", "")
    # Follow up with the profile page; the partially-filled item rides
    # along in meta.
    request_meta = response.meta
    request_meta['item'] = user_item
    yield Request(self.base_url + '/u/{}'.format(user_item['_id']),
                  callback=self.parse_further_information,
                  meta=request_meta,
                  dont_filter=True,
                  priority=1)
class JCpenneySpider(BaseCheckoutSpider):
    """Selenium-backed checkout spider for jcpenney.com.

    Selects product variations (color/size/width/...), adds them to the
    cart and reads order data back out of the ``jcpORDERJSONjcp`` JSON
    blob embedded in the cart page.
    """
    name = 'jcpenney_checkout_products'
    allowed_domains = ['jcpenney.com'
                       ]  # do not remove comment - used in find_spiders()
    SHOPPING_CART_URL = 'http://www.jcpenney.com/jsp/cart/viewShoppingBag.jsp'
    CHECKOUT_PAGE_URL = "https://www.jcpenney.com/dotcom/" \
                        "jsp/checkout/secure/checkout.jsp"

    def start_requests(self):
        """Entry point: open the storefront homepage."""
        yield scrapy.Request('http://www.jcpenney.com/')

    def _get_colors_names(self):
        """Return the names of all selectable (in-stock) color swatches."""
        swatches = self._find_by_xpath(
            '//ul[@class="small_swatches"]'
            '/li[not(@class="sku_not_available_select")]'
            '//a[not(span[@class="no_color"]) and '
            'not(span[@class="color_illegal"])]/img')
        return [x.get_attribute("name") for x in swatches]

    def select_size(self, element=None):
        """Pick the pre-selected size, or any available one."""
        default_attr_xpath = '*//div[@id="skuOptions_size"]//' \
                             'li[@class="sku_select"]'
        avail_attr_xpath = '*//*[@id="skuOptions_size"]//' \
                           'li[not(@class="sku_not_available" or @class="sku_illegal")]/a'
        self.select_attribute(default_attr_xpath, avail_attr_xpath, element)

    def select_color(self, element=None, color=None):
        """Pick the requested color if available, else the default/any.

        Clicks the page <h1> afterwards to dismiss the swatch popup.
        """
        default_attr_xpath = '*//li[@class="swatch_selected"]'
        avail_attr_xpath = ('*//*[@class="small_swatches"]'
                            '//a[not(span[@class="no_color"]) and '
                            'not(span[@class="color_illegal"])]')
        if color and color in self.available_colors:
            default_attr_xpath = '*//*[@class="small_swatches"]//a' \
                                 '[img[@name="%s"]]' % color
        self.select_attribute(default_attr_xpath, avail_attr_xpath, element)
        self._find_by_xpath('//h1')[0].click()
        time.sleep(1)

    def click_condition(self, default_xpath, all_xpaths):
        """True when either the default or any available option exists."""
        return self._find_by_xpath(default_xpath) or self._find_by_xpath(
            all_xpaths)

    def select_attribute(self, default_attr_xpath, avail_attr_xpath,
                         element):
        """Click a variation option and wait for the page loader to clear."""
        max_retries = 20
        retries = 0
        if self.click_condition(default_attr_xpath, avail_attr_xpath):
            self._click_attribute(default_attr_xpath, avail_attr_xpath,
                                  element)
        # Poll until the loading overlay disappears (max ~20s).
        while self.driver.find_elements(
                By.ID, 'page_loader') and retries < max_retries:
            time.sleep(1)
            retries += 1
        # Debug: name of the calling select_* method.
        print(inspect.currentframe().f_back.f_code.co_name)

    def select_width(self, element=None):
        """Pick the pre-selected width, or any available one."""
        default_attr_xpath = '*//div[@id="skuOptions_width"]//' \
                             'li[@class="sku_select"]'
        avail_attr_xpath = '*//*[@id="skuOptions_width"]//' \
                           'li[not(@class="sku_not_available" or @class="sku_illegal")]/a'
        self.select_attribute(default_attr_xpath, avail_attr_xpath, element)

    def select_waist(self, element=None):
        """Pick the pre-selected waist size, or any available one."""
        default_attr_xpath = (
            '*//*[@id="skuOptions_waist"]//li[@class="sku_select"]')
        avail_attr_xpath = ('*//*[@id="skuOptions_waist"]//'
                            'li[not(@class="sku_not_available" '
                            'or @class="sku_illegal")]')
        self.select_attribute(default_attr_xpath, avail_attr_xpath, element)

    def select_inseam(self, element=None):
        """Pick the pre-selected inseam, or any available one."""
        default_attr_xpath = (
            '*//*[@id="skuOptions_inseam"]//li[@class="sku_select"]')
        avail_attr_xpath = ('*//*[@id="skuOptions_inseam"]//'
                            'li[not(@class="sku_not_available" '
                            'or @class="sku_illegal")]')
        self.select_attribute(default_attr_xpath, avail_attr_xpath, element)

    def select_neck(self, element=None):
        """Pick the pre-selected neck size, or any available one."""
        default_attr_xpath = (
            '*//*[@id="skuOptions_neck size"]//li[@class="sku_select"]')
        avail_attr_xpath = ('*//*[@id="skuOptions_neck size"]//'
                            'li[not(@class="sku_not_available" '
                            'or @class="sku_illegal")]')
        self.select_attribute(default_attr_xpath, avail_attr_xpath, element)

    def select_sleeve(self, element=None):
        """Pick the pre-selected sleeve length, or any available one."""
        default_attr_xpath = (
            '*//*[@id="skuOptions_sleeve"]//li[@class="sku_select"]')
        avail_attr_xpath = ('*//*[@id="skuOptions_sleeve"]//'
                            'li[not(@class="sku_not_available" '
                            'or @class="sku_illegal")]')
        self.select_attribute(default_attr_xpath, avail_attr_xpath, element)

    def _parse_attributes(self, product, color, quantity):
        """Select every variation dimension, then set the quantity."""
        time.sleep(10)  # let the product page settle before clicking
        self.select_color(product, color)
        self.select_size(product)
        self.select_width(product)
        self.select_waist(product)
        self.select_inseam(product)
        self.select_neck(product)
        self.select_sleeve(product)
        self._set_quantity(product, quantity)

    def _get_products(self):
        """Return the product container element(s) on the page."""
        return self._find_by_xpath(
            '//*[@id="regularPP"]|//*[contains(@class,"product_row")]')

    def _add_to_cart(self):
        """Click whichever add-to-bag button variant the page shows."""
        addtobagbopus = self._find_by_xpath('//*[@id="addtobagbopus"]')
        addtobag = self._find_by_xpath('//*[@id="addtobag"]')
        if addtobagbopus:
            self._click_on_element_with_id('addtobagbopus')
        elif addtobag:
            self._click_on_element_with_id('addtobag')
        time.sleep(5)

    def _do_others_actions(self):
        """Dismiss the interstitial offer page if it appears."""
        skip_this_offer = self._find_by_xpath(
            '//a[contains(@href,"javascript:skipThisOffer")]')
        if skip_this_offer:
            skip_this_offer[0].click()
            time.sleep(4)

    def _set_quantity(self, product, quantity):
        """Select the requested quantity in the quantity dropdown.

        Best-effort: silently ignores products without a quantity select.
        """
        quantity_option = Select(
            self.driver.find_element_by_xpath('*//*[@name="prod_quantity"]'))
        try:
            quantity_option.select_by_value(str(quantity))
            quantity_selected = quantity_option.first_selected_option.text
            if quantity_selected != str(quantity):
                time.sleep(4)
            self.log('Quantity "{}" selected'.format(quantity))
        except:
            pass

    def _get_product_list_cart(self):
        """Extract the order JSON blob from the cart page, or None."""
        time.sleep(1)
        self.page_source = self.driver.page_source
        self.page_selector = Selector(text=self.page_source)
        try:
            item_info = re.findall('var jcpORDERJSONjcp = (\{.+?\});',
                                   self.page_source, re.MULTILINE)[0]
            self.item_info = json.loads(item_info)
            return self.item_info
        except IndexError:
            return None

    def _get_products_in_cart(self, product_list):
        """Items purchased, straight from the order JSON."""
        return product_list.get('purchasedItems')

    def _get_subtotal(self):
        """Merchandise subtotal (with savings) from the order JSON."""
        return self.item_info.get('merchantTotalWithSavings')

    def _get_total(self):
        """Order total from the order JSON."""
        return self.item_info.get('orderTotal')

    def _get_item_name(self, item):
        """Display name of a cart line item."""
        return item.get('displayName')

    def _get_item_id(self, item):
        """Item number with its two-character prefix stripped."""
        return item.get('itemNumber')[2:]

    def _get_item_price(self, item):
        """Line total price as a string."""
        return str(item.get('lineTotalPrice'))

    def _get_item_price_on_page(self, item):
        """Unit price, preferring the JSON value over the HTML one."""
        price_on_page_from_json = float(item.get('lineUnitPrice'))
        price_on_page_from_html = self.page_selector.xpath(
            '//span[contains(@data-anid, "product_CurrentSellingPrice")]/text()'
        ).re(FLOATING_POINT_RGEX)
        price_on_page_from_html = float(is_empty(price_on_page_from_html, 0))
        return price_on_page_from_json if price_on_page_from_json >= 0 else price_on_page_from_html

    def _get_item_color(self, item):
        """Item color from the cart page HTML (new or old page layout)."""
        selector = scrapy.Selector(text=self.page_source)
        color_new = is_empty(
            selector.xpath(
                '//span[@class="size" and '
                'contains(text(),"color:")]/text()').re('color\:\n(.+)'))
        color_old = is_empty(
            selector.xpath(
                '//span[@class="size" and contains(text(),"color:")]'
                '/strong/text()').extract())
        return color_new or color_old

    def _get_item_quantity(self, item):
        """Quantity of a cart line item."""
        return item.get('quantity')

    def _enter_promo_code(self, promo_code):
        """Type a promo code into the cart and reload the order JSON."""
        self.log('Enter promo code: {}'.format(promo_code))
        promo_field = self._find_by_xpath('//*[@id="cr-code"]')[0]
        promo_field.send_keys(promo_code)
        time.sleep(2)
        promo_field.send_keys(Keys.ENTER)
        time.sleep(5)
        self.driver.refresh()
        time.sleep(5)
        self.item_info = self._get_product_list_cart()

    def _remove_promo_code(self):
        """Remove an applied promo code; logs when none was applied."""
        self.log('Remove promo code')
        try:
            remove_field = self._find_by_xpath(
                '//a[@title="remove" and @class="cr-remove"]')[0]
            remove_field.click()
            time.sleep(10)
        except IndexError:
            self.log('Invalid promo code')

    def _get_promo_total(self):
        """Order total after promo (same JSON field as the plain total)."""
        return self._get_total()

    def _get_promo_subtotal(self):
        """Subtotal after promo, as a string."""
        return str(self._get_subtotal())

    def _parse_no_longer_available(self):
        """True when the cart shows an availability error box."""
        return bool(self._find_by_xpath('//*[@class="error_holder"]'))
def fetch_userdata(self, url):
    """Scrape a Yelp user profile page into a YelpUser object.

    Fills the basic profile fields from the page header, optionally
    downloads the avatar, and serializes the sidebar sections (rating
    distribution, review votes/stats, compliments) into ``user.meta``
    as JSON.

    Fixed: the avatar download wrote the ``requests.Response`` object
    itself instead of ``.content`` (TypeError at runtime); the image file
    is now opened via ``with`` and paths are built with ``os.path.join``
    (the old mkdir/open strings disagreed about the separator); a dead
    ``item = item.strip()`` loop was removed; a None section heading no
    longer crashes ``key.find``.
    """
    user = YelpUser()
    response = requests.get(url)
    # Pass the body text explicitly: scrapy's Selector expects a scrapy
    # Response, not a requests one.
    page = Selector(text=response.text)
    # User id is the trailing query-string value of the profile URL.
    user.yelp_id = url[url.rfind('=') + 1:]
    user.name = page.xpath(
        '//div[@class="user-profile_info arrange_unit"]/h1/text()'
    ).extract_first()
    user.location = page.xpath(
        '//div[@class="user-profile_info arrange_unit"]/h3/text()'
    ).extract_first()
    user.tagline = page.xpath(
        '//p[@class="user-tagline"]/text()').extract_first()
    user.friends_count = page.xpath(
        '//li[@class="friend-count"]/strong/text()').extract_first()
    user.reviews_count = page.xpath(
        '//li[@class="review-count"]/strong/text()').extract_first()
    user.photos_count = page.xpath(
        '//li[@class="photo-count"]/strong/text()').extract_first()
    user.image_url = page.xpath(
        '//div[@class="user-profile_avatar"]//img/@src').extract_first()
    if MUST_DOWNLOAD_USER_IMAGE:
        image_dir = os.path.join(BASE_DIR, 'UserImages')
        if not os.path.exists(image_dir):
            os.mkdir(image_dir)
        image_path = os.path.join(image_dir, user.yelp_id + '.jpg')
        with open(image_path, 'wb') as f:
            # Write the response *bytes*, not the Response object.
            f.write(requests.get(user.image_url).content)
        user.image_path = image_path
    # Sidebar sections -> JSON metadata.
    sidebar = page.xpath('//div[@class="user-details-overview_sidebar"]')
    extra_data = {}
    for ysection in sidebar.xpath('.//div[@class="ysection"]'):
        key = ysection.xpath('.//h4/text()').extract_first()
        if key is None:
            continue
        if key == 'Rating Distribution':
            starts_distribution = ysection.xpath(
                './/td[@class="histogram_count"]/text()').extract()
            extra_data[key] = dict()
            extra_data[key]['5 stars'] = starts_distribution[0]
            extra_data[key]['4 stars'] = starts_distribution[1]
            extra_data[key]['3 stars'] = starts_distribution[2]
            extra_data[key]['2 stars'] = starts_distribution[3]
            extra_data[key]['1 stars'] = starts_distribution[4]
        elif key == 'Review Votes' or key == 'Stats':
            items = ysection.xpath('.//ul/li')
            items_title = ysection.xpath(
                './/ul/li/text()[not(normalize-space(.)="")]').extract()
            extra_data[key] = dict()
            for title, li in dict(zip(items_title, items)).items():
                extra_data[key][title.strip()] = li.xpath(
                    './/strong/text()').extract_first()
        elif key.find('Compliments') != -1:
            items = ysection.xpath('.//li')
            extra_data['Compliments'] = dict()
            for li in items:
                compliment = li.xpath('.//span/@class').extract_first()
                extra_data['Compliments'][
                    self.compliments[compliment]] = li.xpath(
                        './/small/text()').extract_first()
    user.meta = json.dumps(extra_data)
    return user
<NOSCRIPT><em>Địa chỉ email này được bảo vệ bởi JavaScript.<BR>Bạn cần kích hoạt Javascript để có thể xem.</em></NOSCRIPT> </div> """ sel = Selector(text=t) # address_str = u'Địa chỉ' # print address_str # # address_str = u'dcdc' # print sel.xpath('//*[@class="left-detail"]/div[contains(., \''+ address_str +'\')]/div[2]//text()').extract() # //*[@id="product-detail"]/div[8]/table/tbody/tr/td[1]/div/div[2]/div[2]/div[2] # //*[@id="product-detail"]/div[8]/table/tbody/tr/td[1]/div/div[2] email_data = sel.xpath('//*[@class="right"]/script//text()').extract_first() from HTMLParser import HTMLParser h = HTMLParser() import re email_extract = re.search(r"mailto\:(.*)'", email_data) if email_extract.group(1): email = email_extract.group(1) email = h.unescape(email) print email
def parse(self, response):
    """Parse one Yelp business review page.

    Persists a Review per review box, fetching/creating the author as a
    YelpUser (anonymous reviews become a "Qype User"), optionally archives
    the raw page as CrawlData, commits the session, and follows the
    rel="next" pagination link.
    """
    page = Selector(response)
    review_boxes = page.xpath(
        '//ul[@class="ylist ylist-bordered reviews"]/li')
    # The first <li> of the list is not a review entry.
    del review_boxes[0]
    for review_box in review_boxes:
        rv = Review()
        rv.business_id = self.biz_id
        rv.user_id = review_box.xpath(
            './/li[@class="user-name"]/a/@href').extract_first()
        if rv.user_id is not None:
            user_url = rv.user_id
            # The id is everything after the last '=' of the profile href.
            rv.user_id = rv.user_id[rv.user_id.rfind("=") + 1:]
            # Only fetch the full profile for users we have not seen yet.
            if self.session.query(YelpUser).filter(
                    YelpUser.yelp_id == rv.user_id).count() == 0:
                user = self.fetch_userdata('https://www.yelp.com' + user_url)
                self.session.add(user)
        else:
            # Anonymous review: synthesize a "Qype User" from the card itself.
            user = YelpUser()
            user.yelp_id = None
            user.name = "Qype User"
            user.location = review_box.xpath(
                './/li[@class="user-location responsive-hidden-small"]/b/text()'
            ).extract_first().strip()
            user.photos_count = review_box.xpath(
                './/li[@class="photo-count responsive-small-display-inline-block"]/b/text()'
            ).extract_first()
            user.friends_count = review_box.xpath(
                './/li[@class="friend-count responsive-small-display-inline-block"]/b/text()'
            ).extract_first()
            user.reviews_count = review_box.xpath(
                './/li[@class="review-count responsive-small-display-inline-block"]/b/text()'
            ).extract_first()
            user.meta = None
            self.session.add(user)
        rv.text = review_box.xpath(
            './/div[@class="review-content"]/p/text()').extract_first()
        rv.rating = review_box.xpath(
            './/div[@class="review-content"]/div[@class="biz-rating biz-rating-large clearfix"]/div/div/@title'
        ).extract_first()
        # Title looks like "4.0 star rating" — keep only the leading number.
        rv.rating = rv.rating[0:rv.rating.find(" ")]
        rv.date = review_box.xpath(
            './/div[@class="review-content"]/span[@class="rating-qualifier"]/text()'
        ).extract_first()
        self.session.add(rv)
    # NOTE(review): this archives the raw page only when the URL is ALREADY
    # present in CrawlData ('!= 0'); a de-duplication guard would be '== 0'.
    # Preserved as-is — confirm the intent before flipping it.
    if self.session.query(CrawlData).filter(
            CrawlData.url == response.url).count() != 0:
        crawl_data = CrawlData()
        crawl_data.body = response.body
        crawl_data.requestHeader = str(response.request.headers)
        crawl_data.url = response.url
        self.session.add(crawl_data)
    self.session.commit()
    next_page = page.xpath('//link[@rel="next"]/@href').extract_first()
    if next_page is not None:
        yield response.follow(next_page, self.parse)
time.sleep(2)
# Best-effort dismissal of the cookie banner ("Accetta" = Italian "Accept");
# the page may not show it, hence the blanket except/pass.
try:
    cookiesBtnElem = driver.find_element_by_xpath("//button[text()='Accetta']")
    driver.execute_script("arguments[0].click()", cookiesBtnElem)
    time.sleep(1)
except:
    pass
# Walk the catalog pages, collecting every product link into urlList.
# NOTE(review): driver, pageCntr and urlList are defined before this fragment.
while True:
    pageCntr += 1
    html = driver.page_source
    respObj = Selector(text=html)
    #if pageCntr > 27:
    cards = respObj.xpath("//div[@data-list-type='Catalog']/div[@id]")
    for card in cards:
        urlList.append(card.xpath(".//a[contains(@id, 'app_lnk')]/@href").get())
    # Two pagination markups exist: numbered <a data-page> links and
    # <span class="pagination"> entries; click whichever carries the number
    # of the page we just advanced to.
    nextPageType1 = respObj.xpath(f"//a[@data-page and text()='{pageCntr}']")
    nextPageType2 = respObj.xpath(f"//span[contains(@class, 'pagination') and text()='{pageCntr}']")
    if nextPageType1:
        nextBtnElem = driver.find_element_by_xpath(f"//a[@data-page and text()='{pageCntr}']")
        driver.execute_script("arguments[0].click()", nextBtnElem)
        time.sleep(2)
        print(f"\n\n PAGE-{pageCntr}")
    elif nextPageType2:
        nextBtnElem = driver.find_element_by_xpath(f"//span[contains(@class, 'pagination') and text()='{pageCntr}']")
        driver.execute_script("arguments[0].click()", nextBtnElem)
        time.sleep(2)
    # NOTE(review): there is no visible break when neither pagination element
    # matches — either the loop is infinite or the snippet is truncated here.
def _all_principal_td(self):
    """Return the <td> cells whose @headers starts with 'LINK BREAK_COUNTRY_NAME'."""
    page_selector = Selector(text=self._content)
    principal_td_xpath = '//td[starts-with(@headers, "LINK BREAK_COUNTRY_NAME")]'
    return page_selector.xpath(principal_td_xpath)
# 可利用Beautiful Soup、pyquery及正则表达式来提取网页数据 # Scrapy提供了自己的数据提取方法:Selector(选择器).基于lxml构建,支持XPath选择器、CSS选择器就正则,解析速度和准确度非常高 # 1.直接使用:独立模块,可直接利用Selector类构建一个选择器对象,调用相关方法如xpath、css来提取数据 # 针对一段HTML,用如下方式狗结案Selector对象提取数据: from scrapy import Selector body = '<html><head><title>Hello World</title></head><body></body></html>' selector = Selector(text=body) title = selector.xpath('//title/text()').extract_first() # 查找title中的文本,XPath选择器最后加text方法发可实现文本提取 print(title) # 没有在Scrapy框架中运行,把Scrapy中的Selector单独拿出来使用,构建时传入text参数,生成了Selector选择器对象,像Scrapy中的解析 # 方式一样,调用xpath、css方法来提取。 # 2.Scrapy shell:Selector主要与Scrapy结合使用,Scrapy的回调函数中response直接调用xpath或者css方法提取数据, # 所以借助Scrapy shell模拟Scrapy请求过程,理解相关提取方法 # 用官方文档样例页面:http://doc.scrapy.org/en/latest/_static/selectors-sample1.html # 开启Srapy shell,命令行输入: scrapy shell http://doc.scrapy.org/en/latest/_static/selectors-sample1.html # 进入到Scrapy shell模式。过程是,Scrapy发起一次请求,请求的URL是命令行下输入的URL,把可操作的变量request、response传递给我 # 可在命令行模式下输入命令调用对象的一些操作方法,回车后实时显示结果。 # 演示实例都将页面的源码作为分析目标,源码: <html> <head> <base href='http://example.com/' /> <title>Example website</title> </head> <body> <div id = 'images'> <a href='imgae1.html'>Name: My image 1 <br /><img src='image1_thumb.jpg' /></a> <a href='imgae1.html'>Name: My image 2 <br /><img src='image2_thumb.jpg' /></a> <a href='imgae1.html'>Name: My image 3 <br /><img src='image3_thumb.jpg' /></a>
def parse(self, response):
    """Parse a company credit-profile page into a DetailInformation item.

    First verifies this is the real profile page (it must contain the bold
    label '单位名称', "organisation name"); otherwise the response is logged
    and UnknownResponseError is raised (e.g. a captcha challenge page).
    """
    sel = Selector(text=response.body)
    print len(sel.xpath(u"//b[text()='单位名称']")) != 0, "parse 条件"
    log.msg("parse 条件=%s" % str(len(sel.xpath(u"//b[text()='单位名称']")) != 0), level=log.INFO)
    if (len(sel.xpath(u"//b[text()='单位名称']")) != 0):
        # Marker found: a normal profile page, not a captcha prompt.
        pass
    else:
        log.msg("code=%s, %s" % (str(response.status), response.body), level=log.INFO)
        raise UnknownResponseError
    #========================================================
    """ 第一部分:企业信用档案 """
    # Part 1: corporate credit profile.
    item = DetailInformation()
    item['basic_info'] = fundation_info_extract(response)
    #========================================================
    #========================================================
    """ 第一部分 政府监管信息 """
    # Government regulation information (also labelled "part 1" originally).
    item['regulatory_info'] = extract_combine_JCXX(response)
    #========================================================
    #========================================================
    """ 第三部分 行业评价信息 """
    # Part 3: industry evaluations (certifications, trade associations,
    # public utilities).
    keywords_list = ['2-1.体系/产品/行业认证信息', '2-2.行业协会(社会组织)评价信息',
                     '2-3.水电气通讯等公共事业单位评价']
    item['envaluated_info'] = block_info_extract(response,
                                                 keywords_list)
    #========================================================
    """ 第四部分 媒体评价信息 """
    # Part 4: media evaluations.
    keywords_list = ['3-1.媒体评价信息']
    item['media_env'] = block_info_extract(response, keywords_list)
    #========================================================
    """ 第五部分 金融信贷信息 """
    # Part 5: finance / credit info — the request-based implementation below
    # is disabled.
    #url = 'http://www.11315.com/\
    #getTradeLendingCount?companyId=%s'%response.url[7:15]
    #header = {'User-Agent':"Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/41.0.2272.89 Safari/537.36",
    #          'Referer':response.url}
    #req = urllib2.Request(url=url, headers=header)
    #xtml = urllib2.urlopen(req)
    #Nums = xtml.read()
    #print Nums, "this is Nums"
    #Nums = eval(Nums).split(",")
    #print Nums, "this is anothor Nums"
    #total = str(sum([int(i) for i in Nums]))
    #Nums.insert(0, total) #在头部插入
    #if total == '0':
    #    t_url = ""
    #else:
    #    t_url = sel.xpath(u"//script").re(ur"html\(\'<a href=\"([\w\W]*?)\"")[0]
    #Nums.append(t_url)
    #Nums_re = "|".join(Nums)
    keywords_list = ['4-2.民间借贷评价信息']
    item["credit_fin"] = block_info_extract(response, keywords_list)
    #=======================================================
    """ 第六部分 企业运营信息 """
    # Part 6: business-operations info — disabled (would require running JS
    # or simulating extra requests for two rows of data).
    #keywords_list = ['5-3.水电煤气电话费信息',
    #'5-4.纳税信息']
    #要么运行js,要么模拟请求,破网站,就两行数据至于吗
    #item['operation_info'] = block_info_extract(response, keywords_list)
    #========================================================
    """ 第七部分 市场反馈信息 """
    # Part 7: market feedback (consumers, business partners, employees, other).
    keywords_list = ['6-1.消费者评价信息', '6-2.企业之间履约评价', '6-3.员工评价信息',
                     '6-4.其他']
    item['feedback_info'] = block_info_extract(response, keywords_list)
    #========================================================
    return item
def main():
    """Scrape JD.com product pages for every SKU listed in sku.xlsx and
    append the results to output_filename as GBK-encoded CSV.

    A fresh headless Chrome is started per SKU.  BUG FIX: the browser is now
    always closed — the original leaked a Chrome process on every SKU whose
    navigation failed or timed out (those paths `continue`d without close()).
    """
    adsl = ADSL()
    result = []
    df_input = pd.read_excel('sku.xlsx')
    sku_list = df_input['sku'].values
    start = 0
    length = len(sku_list)
    while start < length:
        print('正在爬取第{}条'.format(start + 1))
        sku = sku_list[start]
        options = webdriver.ChromeOptions()
        options.add_argument(
            '--user-agent=Mozilla/5.0 (Windows NT 999999.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/60.0.3112.113 Safari/537.36'
        )
        options.add_argument('--headless')
        options.add_argument('--ignore-certificate-errors')
        options.add_argument('--disable-gpu')
        driver = webdriver.Chrome(executable_path=r'chromedriver.exe',
                                  chrome_options=options)
        wait = WebDriverWait(driver, TIMEOUT)  # max wait for page elements
        url = 'https://item.jd.com/{}.html'.format(sku)
        try:
            try:
                driver.get(url)
            except Exception as e:
                print(e)
                continue  # next SKU; driver closed in finally
            try:
                wait.until(
                    EC.presence_of_element_located(
                        (By.XPATH, '//a[@id="InitCartUrl"]')))
            except Exception:
                print('访问超时,重试')
                continue
            resp = Selector(text=driver.page_source)
            title = resp.xpath('//div[@class="sku-name"]/text()').extract()
            # Page variants: the name is sometimes the second text node.
            # BUG FIX: an empty match used to raise IndexError on title[0].
            if len(title) > 1:
                title = title[1].strip()
            elif title:
                title = title[0].strip()
            else:
                title = None
            price = resp.xpath(
                '//span[@class="p-price"]/span[2]/text()').extract_first()
            comment = resp.xpath(
                '//div[@id="comment-count"]/a/text()').extract_first()
            # extract_first() returns None on no match; no try/except needed.
            activity_type = resp.xpath(
                '//div[@class="activity-type"]/strong/text()').extract_first()
            area = resp.xpath(
                '//div[@class="ui-area-text"]/text()').extract_first()
            store = resp.xpath(
                '//div[@id="store-prompt"]/strong/text()').extract_first()
            d = {}
            d['title'] = title
            d['price'] = price
            d['comment'] = comment
            d['activity_type'] = activity_type
            d['area'] = area
            d['store'] = store
            d['sku'] = str(sku)
            d['url'] = url
            result.append(d)
            time.sleep(2 * random.randint(2, 6))
        finally:
            driver.close()
            start += 1
        # Rotate the ADSL connection only after a successful scrape,
        # matching the original flow (failure paths skip this).
        adsl.reconnect()
    df = pd.DataFrame(result)
    df.to_csv(output_filename, encoding='gbk', mode='a', header=False)
    print('爬取结束,共爬取了{}条'.format(length))
# -*- coding: utf-8 -*- from scrapy import Selector import requests response = requests.get("https://www.baidu.com").text select = Selector(text=response) title = select.xpath("//title/text()").extract_first() print(title)
def parse(self, response):
    """Walk every category URL in self.df with the shared Selenium driver
    and yield one item dict per product found.

    Three page layouts are handled:
      * a product grid directly on the page (lvl4_cat stays None),
      * a "Shop by Category" dropdown — each entry becomes lvl4_cat,
      * "Shop by category" tiles — each tile becomes lvl4_cat.
    Category pages reached via the last two layouts are opened in a second
    browser tab so the first tab keeps its place.  The three previously
    duplicated pagination loops are factored into _yield_listing_pages.
    """
    driver = response.meta['driver']
    for _, value in self.df.iterrows():
        driver.get(value['url'])
        time.sleep(2)
        resp_obj = Selector(text=driver.page_source)
        item_grid = resp_obj.xpath("//div[@data-type='items']")
        dropdown_cats = resp_obj.xpath(
            "//span[text()='Shop by Category' or text()='Shop by category']/parent::span/parent::button/following-sibling::div/div/ul/li"
        )
        tile_cats = resp_obj.xpath(
            "//h2[text()='Shop by category']/parent::div/parent::div/following-sibling::div//div[@class='TempoCategoryTile-tile valign-top']"
        )
        if item_grid:
            yield from self._yield_listing_pages(driver, value, None)
        elif dropdown_cats:
            driver.execute_script("window.open('');")
            driver.switch_to.window(driver.window_handles[1])
            for cat in dropdown_cats:
                lvl4_cat = cat.xpath(".//a/span/text()").get()
                cat_href = cat.xpath(".//a/@href").get()
                driver.get(f"https://www.walmart.com{cat_href}")
                yield from self._yield_listing_pages(driver, value, lvl4_cat)
            driver.close()
            driver.switch_to.window(driver.window_handles[0])
        elif tile_cats:
            driver.execute_script("window.open('');")
            driver.switch_to.window(driver.window_handles[1])
            for cat in tile_cats:
                lvl4_cat = cat.xpath(".//span/text()").get()
                cat_href = cat.xpath(".//following-sibling::a/@href").get()
                driver.get(f"https://www.walmart.com{cat_href}")
                yield from self._yield_listing_pages(driver, value, lvl4_cat)
            driver.close()
            driver.switch_to.window(driver.window_handles[0])

def _yield_listing_pages(self, driver, value, lvl4_cat):
    """Yield an item per product on the current listing page, following the
    numeric pagination until no 'Next Page' button remains."""
    page_no = 1
    while True:
        resp_obj = Selector(text=driver.page_source)
        for prod in resp_obj.xpath("//div[@data-type='items']"):
            product_url, product_name, price = self._extract_product(prod)
            yield {
                'product_url': product_url,
                'product_name': product_name,
                'product_price': price,
                'lvl1_cat': value['lvl1_cat'],
                'lvl2_cat': value['lvl2_cat'],
                'lvl3_cat': value['lvl3_cat'],
                'lvl4_cat': lvl4_cat,
            }
        has_next = resp_obj.xpath("//span[text()='Next Page']/parent::button")
        page_no += 1
        if not has_next:
            break
        next_href = resp_obj.xpath(
            f"//ul[@class='paginator-list']/li/a[text()='{page_no}']/@href"
        ).get()
        driver.get(f"https://www.walmart.com{next_href}")
        time.sleep(2)

def _extract_product(self, prod):
    """Return (url, name, price) for one product card, handling both grid
    markups and price ranges (low - high)."""
    href = prod.xpath(
        ".//div[@class='search-result-product-title gridview']/a/@href").get()
    product_url = f'https://www.walmart.com{href}'
    product_name = prod.xpath(
        "normalize-space(.//div[@class='search-result-product-title gridview']/a/span/text())"
    ).get()
    price = prod.xpath(
        "normalize-space(.//span[@class='price-main-block']/span/span/text())"
    ).get()
    if not product_name:
        # Alternate card markup.
        href = prod.xpath(
            ".//span[text()='Product Title']/parent::div/a/@href").get()
        product_url = f'https://www.walmart.com{href}'
        product_name = prod.xpath(
            "normalize-space(.//span[text()='Product Title']/parent::div/a/span/text())"
        ).get()
    if not price:
        low = prod.xpath(
            "normalize-space(.//span[@class='price price-main'][1]/span/text())"
        ).get()
        high = prod.xpath(
            "normalize-space(.//span[@class='price price-main'][2]/span/text())"
        ).get()
        price = f'{low} - {high}'
    return product_url, product_name, price
# Mapping from the literal "\uXXXX" escape sequences found in Amica's embedded
# JSON blob to the Polish/special characters they encode.
# BUG FIX: the original replaced the bare prefix '\\u017' with 'Ź', which
# would mangle the full escape '\\u0179' into 'Ź9'; the complete escape is
# listed here instead (order preserved otherwise).
_PL_ESCAPES = (
    ('\\u0105', 'ą'), ('\\u0119', 'ę'), ('\\u0107', 'ć'), ('\\u0144', 'ń'),
    ('\\u015b', 'ś'), ('\\u015a', 'Ś'), ('\\u00f3', 'ó'), ('\\u0141', 'Ł'),
    ('\\u0142', 'ł'), ('\\u017a', 'ź'), ('\\u017b', 'Ż'), ('\\u017c', 'ż'),
    ('\\u0179', 'Ź'), ('\\u00ae', '®'), ('\\u00b0', '°'), ('\u00b0', '°'),
    ('\u2070', '°'), ('\\u2070', '°'), ('\\u2013', '-'), ('\u2013', '-'),
    ('\\u2026', '...'), ('\u2026', '...'), ('\\n', ''), ('\\/', '/'),
)


def _unescape_pl(text, strip_colons=False):
    """Decode the backslash escapes listed in _PL_ESCAPES; optionally strip
    colons (used for the technical table and short description)."""
    for escape, char in _PL_ESCAPES:
        text = text.replace(escape, char)
    if strip_colons:
        text = text.replace(':', '')
    return text


def _make_thumbnail(path, index):
    """Resize the image at `path` in place so its longer edge is 600 px,
    keeping the PNG/JPEG format; other formats are reported and skipped."""
    im = Image.open(path)
    img_format = im.format
    width, height = im.size
    ratio = (width if width > height else height) / 600
    im = im.resize((round(width / ratio), round(height / ratio)))
    if img_format in ('PNG', 'JPEG'):
        im.save(path, img_format)
    else:
        print(f"Nie umiem zrobić zdjęcia nr {index} :'( (typ {img_format})")


def amica(report_label, product, model):
    """Scrape an Amica product page for `model`.

    Downloads and thumbnails product/description images under
    file_path/<model>/, then builds three HTML fragments from the page's
    embedded JSON: the graphic description, the short bullet list and the
    technical table.  Returns [reg, short, tech], or -1 when the model
    cannot be found on amica.pl (the failure is appended to report_label).
    """
    from globals import file_path
    if product[7].startswith('http'):
        # A direct product URL was provided.
        page_address = product[7]
        driver.get(product[7])
        sel = Selector(text=requests.get(product[7]).content)
    else:
        # Search amica.pl for whatever follows the brand name in the title.
        search = product[1][product[1].lower().find('amica') + len('amica') + 1:]
        amica_link = f'https://www.amica.pl/szukaj/{search}'
        driver.get(amica_link)
        sel = Selector(text=requests.get(amica_link).content)
        # Find the search result whose product symbol equals `model`.
        page_address = None
        try:
            for i in range(len(sel.xpath('//div[@class="container"]'))):
                if driver.find_element_by_xpath(
                        f'//h3[@class="prodSymbol"][{i + 1}]').text == model:
                    page_address = driver.find_element_by_xpath(
                        f'//h3[@class="prodSymbol"][{i + 1}]/a').get_attribute(
                            'href')
                    break
        except NoSuchElementException:
            page_address = None
        if page_address is None:
            # BUG FIX: the original raised NameError later when the loop
            # completed without a match; report and skip instead.
            report_label[
                'text'] += f"Nie znaleziono {model} na stronie Amica. Pomijam go."
            return -1
    driver.find_element_by_css_selector(
        '#produkty > div.moreProducts > div > div > div > div > div > div > div.image > a'
    ).click()
    sleep(1)
    driver.find_element_by_css_selector(
        '#menu01 > div > div.product-view__media > img').click()
    first = driver.find_element_by_css_selector(
        '#prod_app > div.medialightbox__overlay > div > div.cool-lightbox__inner > div.cool-lightbox__wrapper > '
        'div > div > img').get_attribute('src')
    # Download product photos (max 15; stop when the carousel wraps back to
    # the first image), then shrink them into thumbnails.
    count = 0
    while count < 15:
        if count == 0:
            res = requests.get(first)
        else:
            desc_img = driver.find_element_by_css_selector(
                '#prod_app > div.medialightbox__overlay > div > div.cool-lightbox__inner > div.cool-lightbox__wrapper '
                '> div > div > img').get_attribute('src')
            if desc_img == first:
                break
            res = requests.get(desc_img)
        with open(f'{file_path}/{model}/obrazki_produktu/{count}.jpg', 'wb') as fh:
            fh.write(res.content)
        try:
            # Advance the lightbox; the arrow may be hidden on the last slide.
            driver.find_element_by_xpath(
                '//*[@id="prod_app"]/div[4]/div/div[2]/div[2]/button[2]/div'
            ).click()
        except ElementNotInteractableException:
            pass
        sleep(1)
        count += 1
    for y in range(count):
        _make_thumbnail(f'{file_path}/{model}/obrazki_produktu/{y}.jpg', y)
    driver.find_element_by_tag_name('body').send_keys(Keys.ESCAPE)
    # The page embeds its content as JSON inside the 4th <script>; the blob is
    # not standalone-parseable, so fields are pulled out by string slicing.
    sel = Selector(text=requests.get(page_address).content)
    raw = sel.xpath('/html/body/div[1]/script[4]/text()').extract()
    for i in range(len(raw)):
        for junk in ('\n', '\t', '\xa0', '\r', ' '):
            raw[i] = raw[i].replace(junk, '')
    # Collect descTitle / descIconUrl / descText triples.
    t = raw[0]
    t = t[t.find('"descTitle":'):]
    t = t[:t.find('}]}')]
    desc = []
    imgs = []
    while t.find('"descTitle":') != -1:
        t = t[t.find('"descTitle":') + 13:]
        desc.append(t[:t.find('"')])
        t = t[t.find('"descIconUrl":') + 15:]
        imgs.append(t[:t.find('"')])
        t = t[t.find('"descText":') + 12:]
        desc.append(t[:t.find('"')])
    imgs = [img.replace('\\', '') for img in imgs]
    # Download the description images locally.
    for i, img in enumerate(imgs):
        res = requests.get(img)
        with open(f'{file_path}/{model}/obrazki_opisu/{i}.jpg', 'wb') as fh:
            fh.write(res.content)
    desc = [_unescape_pl(d) for d in desc]
    # Graphic description: one row of three icon+title+text columns per six
    # desc entries (title/text alternate in `desc`).
    j = 0
    fin = ['<div class="product-description-section">']
    for i in range(0, len(desc), 6):
        fin.append('<div class="three-col-equaly">')
        try:
            fin.append(
                f'<div><img src="https://matrixmedia.pl/media/wysiwyg/Amica/'
                f'{model}/{j}.jpg"/><br/><h2 class="important-header">{desc[i]}</h2>'
            )
            fin.append(f'<p style="font-size: large;">{desc[i + 1]}</p></div>')
            fin.append(
                f'<div><img src="https://matrixmedia.pl/media/wysiwyg/Amica/'
                f'{model}/{j + 1}.jpg"/><br/><h2 class="important-header"> {desc[i + 2]}</h2>'
            )
            fin.append(f'<p style="font-size: large;">{desc[i + 3]}</p></div>')
            fin.append(
                f'<div><img src="https://matrixmedia.pl/media/wysiwyg/Amica/'
                f'{model}/{j + 2}.jpg"/><br/><h2 class="important-header"> {desc[i + 4]}</h2>'
            )
            fin.append(f'<p style="font-size: large;">{desc[i + 5]}</p></div>')
        except IndexError:
            pass  # fewer than six entries remain; emit a partial row
        finally:
            fin.append('</div>')
            j = j + 3
    fin.append('</div>')
    reg = ''.join(fin)
    reg = reg.replace(
        '*Zdjęcie ma charakter poglądowy i może nie przedstawiać dokładnego modelu produktu.',
        '')
    print("------------ OPIS GRAFICZNY ------------")
    print(reg + '\n\n')
    # --- Technical description table ---
    sel = Selector(text=requests.get(page_address).content)
    tech_raw = sel.xpath('/html/body/div[1]/script[4]/text()').extract()
    tech_raw2 = tech_raw[0]
    tech_d = tech_raw2[tech_raw2.find('"attrGroupData"'):tech_raw2.
                       find('"docFilesDataList"')]
    # attrName/attrValue pairs, alternating in tech_desc_1.
    tech_desc_1 = []
    while tech_d.find('"attrName":') != -1:
        tech_d = tech_d[tech_d.find('"attrName":') + 12:]
        tech_desc_1.append(tech_d[:tech_d.find('"')])
        tech_d = tech_d[tech_d.find('"attrValue":') + 13:]
        tech_desc_1.append(tech_d[:tech_d.find('"')])
    # Remaining attrValue-only entries are the feature list.
    tech_d2 = tech_d[tech_d.find(tech_desc_1[-1]):]
    tech_desc_2 = []
    while tech_d2.find('"attrValue":') != -1:
        tech_d2 = tech_d2[tech_d2.find('"attrValue":') + 13:]
        tech_desc_2.append(tech_d2[:tech_d2.find('"')])
    tech_desc = [
        '<table id="plan_b" class="data-table"><tbody><tr class="specs_category"><td '
        'colspan="2">Specyfikacja</td></tr>'
    ]
    for i in range(0, len(tech_desc_1), 2):
        tech_desc.append(f'<tr><td class="c_left">{tech_desc_1[i]}</td>')
        tech_desc.append(f'<td class="c_left">{tech_desc_1[i + 1]}</td></tr>')
    for i, feature in enumerate(tech_desc_2):
        label = 'Funkcje' if i == 0 else ''
        tech_desc.append(f'<tr><td class="c_left">{label}</td>')
        tech_desc.append(f'<td class="c_left">{feature}</td></tr>')
    tech_desc.append('</tbody></table>')
    tech_desc = [_unescape_pl(row, strip_colons=True) for row in tech_desc]
    tech = ''.join(tech_desc)
    print('------------ OPIS TECHNICZNY ------------')
    print(tech + '\n\n')
    # --- Short description: first (up to) six name/value pairs as bullets ---
    tech_desc_1 = [_unescape_pl(attr, strip_colons=True) for attr in tech_desc_1]
    n = min(len(tech_desc_1), 12)
    short = ['<ul>']
    for i in range(0, n, 2):
        short.append(f'<li>{tech_desc_1[i]}: {tech_desc_1[i + 1]}</li>')
    short.append('</ul>')
    short = '\n'.join(short)
    print('------------ OPIS KRÓTKI ------------')
    print(short + '\n\n')
    return [reg, short, tech]
def preview_result(Xpath, inputtext, sep='\n\n'):
    """Print (up to) the first four nodes matched by Xpath in inputtext.

    sep: terminator printed after each element (default: blank line).
    BUG FIX: the original used the module-level `sp` inside the function,
    which raises NameError if called before `sp` is assigned below; the
    keyword parameter (same default value) removes that dependency.
    """
    sel = Selector(text=inputtext)
    result = sel.xpath(Xpath).extract()
    for idx, element in enumerate(result[:4], start=1):
        print(f"Element {idx}: {element}", end=sep)


sp = '\n\n'

url = 'https://www.cdc.gov/nchs/tutorials/NHANES/index_continuous.htm'
# res = requests.get(url)
# html = res.text
# NOTE(review): .content is bytes; Selector(text=...) usually expects str —
# the commented-out res.text may be what was intended. Confirm.
html = requests.get(url).content
xpath = '//p'
xpath2 = '//*'
sel = Selector(text=html)
# extract the 3rd element (here paragraph) of the SelectorList
sll = sel.xpath('//p')[2].extract()
# without extract(), the SelectorList gives a truncated preview of each node
sll_ = sel.xpath('//p')
slla = sel.xpath('//p').extract()
sllf = sel.xpath('//p').extract_first()
# print(sll, slla, sllf, sep=sp)
print(number_of_element(xpath, html), number_of_element(xpath2, html), preview_result(xpath, html), sep=sp)
def parse_detail(self, response):
    """Parse a land-supply result notice detail page and append one CSV row
    per land parcel to self.fileDetail.

    Pages come in two variants — parcels labelled '宗地编号' or '地块编号' —
    handled with the same field extraction.  All fields are pulled from the
    page's flattened text with the reFunction regex helper rather than from
    the DOM, then joined into a comma-separated line.
    """
    try:
        # NOTE: extraction is known to be incomplete for some pages.
        data = Selector(text=response.body.decode('gbk'))
        # Whole-page text with NBSP and ideographic spaces stripped.
        items = str(data.xpath('string(.)').extract()[0]).replace(
            '\xa0', '').replace('\u3000', '')
        # Fields shared by every parcel on the page.
        fileTitle = data.xpath(
            '//td[@class="fh tac bw fwb f18-0 pl2 b0"]/text()'
        ).extract_first()
        # Body title.
        textTitle = data.xpath(
            '//td[@class="fh vat bw f8-0 b1"]/table[1]//tr[1]/td[@align="center"]/text()'
        ).extract_first()
        supllyType = response.meta.get('supllyType').strip()
        administration = response.meta.get('administration').strip()
        supplyNoticeTitle = response.meta.get('supplyNoticeTitle').strip()
        publishTime = response.meta.get('publishTime').strip()
        projectName = ''
        parcelNumber = ''
        parcelLocation = ''
        landPurpose = ''
        landArea = ''
        transferTimeLimit = ''
        transferPrice = ''
        landPurposeDetail = ''
        transferUnit = ''
        remark = ''
        publicityPeriod = ''
        contactUnit = ''
        unitAddr = ''
        postalCode = ''
        contactTel = ''
        contacter = ''
        email = ''
        lanServiceCondition = ''
        # Notice type (not extracted).
        # noticeType =
        # Publicity period.
        publicityPeriod = reFunction(u'公示期:([\s\S]*)三、',
                                     reFunction('四、[\s\S]*', items)).strip()
        # Contact organisation.
        contactUnit = reFunction(u'联系单位:([\s\S]*)单位地址',
                                 reFunction('四、[\s\S]*', items)).strip()
        # Organisation address.
        unitAddr = reFunction(u'单位地址:([\s\S]*)邮政编码',
                              reFunction('四、[\s\S]*', items)).strip()
        # Postal code.
        postalCode = reFunction(u'邮政编码:([\s\S]*)联系电话',
                                reFunction('四、[\s\S]*', items)).strip()
        # Contact phone.
        contactTel = reFunction(u'联系电话:([\s\S]*)联 系 人',
                                reFunction('四、[\s\S]*', items)).strip()
        # Contact person.
        contacter = reFunction(u'联 系 人:([\s\S]*)电子邮件',
                               reFunction('四、[\s\S]*', items)).strip()
        # E-mail.
        email = reFunction(u'电子邮件:([\w\.\@]*)(?:[\S]*)',
                           reFunction('四、[\s\S]*', items)).strip()
        if '宗地编号' in items:
            # Split the text before '二、' into one chunk per parcel,
            # re-prefixing each chunk with its '宗地编号' label.
            for item in [
                    '宗地编号' + _ for _ in re.findall('([\s\S]*)二、', items)
                [0].split('宗地编号')[1:]
            ]:
                # Parcel number.
                parcelNumber = reFunction('宗地编号:(?:\s*)([\s\S]*)地块位置',
                                          item).strip()
                # Parcel location.
                parcelLocation = reFunction('地块位置:(?:\s*)([\s\S]*)土地用途:',
                                            item).strip()
                # Land use.
                landPurpose = reFunction('土地用途:(?:\s*)([\s\S]*)土地面积\(公顷\)',
                                         item).strip()
                # Land area (hectares).
                landArea = reFunction(
                    '土地面积\(公顷\):(?:\s*)([\w}/\.{]*)(?:\s*)', item).strip()
                # Project name.
                projectName = reFunction('项目名称:(?:\s*)([\s\S]*)土地用途明细',
                                         item).strip()
                # Grant term (years).
                transferTimeLimit = reFunction(
                    '出让年限:(?:\s*)([\s\S]*)成交价\(万元\)', item).strip()
                # Transaction price (10k CNY).
                transferPrice = reFunction(
                    '成交价\(万元\):(?:\s*)([\s\S]*)土地用途明细', item).strip()
                # Land-use breakdown (use name + area); falls back to a bare
                # number when the labelled form is absent.
                landPurposeDetail = reFunction(
                    '(?:\s*)面积\(公顷\)(?:\s*)([\w}/\.{]*)受让单位',
                    item).strip() if reFunction(
                        '(?:\s*)面积\(公顷\)(?:\s*)([\w}/\.{]*)受让单位',
                        item).strip() else reFunction(
                            '(?:\s*)([\d\.]*)(?:[\s]*)受让单位', item).strip()
                # Transferee.
                transferUnit = reFunction('受让单位:(?:\s*)([\w}/{]*)(?:\s*)',
                                          item).strip()
                # Land-use conditions.
                lanServiceCondition = reFunction(
                    '土地使用条件:(?:\s*)([\s\S]*)备注', item).strip()
                # Remark.
                # remark = reFunction(u'备注:(?:\s*)([\w}/,、\u4e00-\uffe5()《》:\-\.<≤。{\u3002\uff1f\uff01\uff0c\u3001\uff1b\uff1a\u201c\u201d\u2018\u2019\uff08\uff09\u300a\u300b\u3008\u3009\u3010\u3011\u300e\u300f\u300c\u300d\ufe43\ufe44\u3014\u3015\u2026\u2014\uff5e\ufe4f\uffe5]*)(?:\s*)', item).strip()
                remark = reFunction(u'备注:(?:\s*)([\s\S]*)(?:\s*)[二、]?',
                                    item).strip()
                # Crawl timestamp.
                crawlingTime = time.strftime("%Y-%m-%d %H:%M:%S",
                                             time.localtime())
                # Crawled URL.
                url = response.url
                # Unique row id.
                md5Mark = encrypt_md5(parcelNumber + publishTime +
                                      parcelLocation + url)
                # Persist as one CSV row (commas/newlines inside fields are
                # scrubbed so the row stays one line).
                csvFile = [
                    administration, supplyNoticeTitle, publishTime, fileTitle,
                    textTitle, projectName, parcelNumber, parcelLocation,
                    landPurpose, landArea, transferTimeLimit, transferPrice,
                    landPurposeDetail, transferUnit, remark, publicityPeriod,
                    contactUnit, unitAddr, postalCode, contactTel, contacter,
                    email, lanServiceCondition, crawlingTime, url, md5Mark
                ]
                self.fileDetail.write(','.join([
                    _.replace(',', ' ').replace('\n', '').replace(
                        '\r', '') if _ else _ for _ in csvFile
                ]))
                self.fileDetail.write('\n')
                yield  #TODO
        elif '地块编号' in items:
            # Same extraction for pages using the '地块编号' parcel label.
            for item in [
                    '地块编号' + _ for _ in re.findall('([\s\S]*)二、', items)
                [0].split('地块编号')[1:]
            ]:
                # Parcel number.
                parcelNumber = reFunction('地块编号:(?:\s*)([\s\S]*)地块位置',
                                          item).strip()
                # Parcel location.
                parcelLocation = reFunction('地块位置:(?:\s*)([\s\S]*)土地用途:',
                                            item).strip()
                # Land use.
                landPurpose = reFunction('土地用途:(?:\s*)([\s\S]*)土地面积\(公顷\)',
                                         item).strip()
                # Land area (hectares).
                landArea = reFunction(
                    '土地面积\(公顷\):(?:\s*)([\w}/\.{]*)(?:\s*)', item).strip()
                # Project name.
                projectName = reFunction('项目名称:(?:\s*)([\s\S]*)土地用途明细',
                                         item).strip()
                # Grant term (years).
                transferTimeLimit = reFunction(
                    '出让年限:(?:\s*)([\s\S]*)成交价\(万元\)', item).strip()
                # Transaction price (10k CNY).
                transferPrice = reFunction(
                    '成交价\(万元\):(?:\s*)([\s\S]*)土地用途明细', item).strip()
                # Land-use breakdown (use name + area); falls back to a bare
                # number when the labelled form is absent.
                landPurposeDetail = reFunction(
                    '(?:\s*)面积\(公顷\)(?:\s*)([\w}/\.{]*)受让单位',
                    item).strip() if reFunction(
                        '(?:\s*)面积\(公顷\)(?:\s*)([\w}/\.{]*)受让单位',
                        item).strip() else reFunction(
                            '(?:\s*)([\d\.]*)(?:[\s]*)受让单位', item).strip()
                # Transferee.
                transferUnit = reFunction('受让单位:(?:\s*)([\w}/{]*)(?:\s*)',
                                          item).strip()
                # Land-use conditions.
                lanServiceCondition = reFunction(
                    '土地使用条件:(?:\s*)([\s\S]*)备注', item).strip()
                # Remark.
                remark = reFunction(u'备注:(?:\s*)([\s\S]*)(?:\s*)[二、]?',
                                    item).strip()
                # Crawl timestamp.
                crawlingTime = time.strftime("%Y-%m-%d %H:%M:%S",
                                             time.localtime())
                # Crawled URL.
                url = response.url
                # Unique row id.
                md5Mark = encrypt_md5(parcelNumber + publishTime +
                                      parcelLocation + url)
                # Persist as one CSV row.
                csvFile = [
                    administration, supplyNoticeTitle, publishTime, fileTitle,
                    textTitle, projectName, parcelNumber, parcelLocation,
                    landPurpose, landArea, transferTimeLimit, transferPrice,
                    landPurposeDetail, transferUnit, remark, publicityPeriod,
                    contactUnit, unitAddr, postalCode, contactTel, contacter,
                    email, lanServiceCondition, crawlingTime, url, md5Mark
                ]
                self.fileDetail.write(','.join([
                    _.replace(',', ' ').replace('\n', '').replace(
                        '\r', '') if _ else _ for _ in csvFile
                ]))
                self.fileDetail.write('\n')
                #TODO
    except Exception as e:
        self.log(f'详情页数据解析失败, 错误: {e}', level=logging.ERROR)
def preview_result(Xpath, inputtext, sep='\n\n'):
    """Print (up to) the first four nodes matched by Xpath in inputtext.

    sep: terminator printed after each element (default: blank line).
    BUG FIX: the original referenced an undefined module-level name `sp`
    (NameError unless something else assigned it first); a keyword parameter
    with the conventional default replaces that implicit dependency.
    """
    sel = Selector(text=inputtext)
    result = sel.xpath(Xpath).extract()
    for idx, element in enumerate(result[:4], start=1):
        print(f"Element {idx}: {element}", end=sep)
def parse(self, response):
    """Parse one Redfin search-results page.

    Yields a RedfinTestItem per listing card with the scraped facts plus
    derived financing numbers (mortgage payment, insurance, tax, cash flow),
    then follows pagination until the last results page.

    Parameters
    ----------
    response : scrapy Response for a Redfin search-results page.
    """
    pagesource = Selector(response)
    # Underwriting assumptions used for the monthly-payment estimate.
    tax_rate = .01          # annual property tax as a fraction of price
    interest = 0.0435       # annual mortgage interest rate
    loan_term = 30          # years
    insurance = .5          # NOTE(review): looks like $/sqft/month — confirm units
    dp_percentage = 0.25    # down-payment fraction of the purchase price

    # "Page X of Y" text: first number is the current page, second the total.
    page_text = response.xpath('//span[@class="pageText"]//text()').extract()[0]
    page_numbers = re.findall(r"\d+", page_text)
    current_page = page_numbers[0]
    total_page = page_numbers[1]

    search_results = pagesource.xpath(
        "//div[@class='MapHomeCardReact HomeCard']")
    for search in search_results:
        entry = RedfinTestItem()
        entry['price'] = float(''.join(
            re.findall(
                r"\d+",
                search.xpath(
                    './/span[@data-rf-test-name="homecard-price"]//text()'
                ).extract()[0])))
        entry['street'] = search.xpath(
            './/span[@data-rf-test-id="abp-streetLine"]//text()').extract()[0]
        entry['citystatezip'] = search.xpath(
            './/span[@data-rf-test-id="abp-cityStateZip"]//text()'
        ).extract()[0]
        entry['zipcode'] = re.findall(
            r"\d+",
            search.xpath(
                './/span[@data-rf-test-id="abp-cityStateZip"]//text()'
            ).extract()[0])
        entry['HOA'] = ''.join(
            re.findall(
                r"\d+",
                search.xpath(
                    './/span[@data-rf-test-name="homecard-amenities-hoa"]//text()'
                ).extract()[0]))
        # The three "value" divs on a card are beds, baths and sqft, in order.
        entry['Beds'] = ''.join(
            search.xpath('.//div[@class="value"]//text()').extract()[0])
        entry['Baths'] = ''.join(
            search.xpath('.//div[@class="value"]//text()').extract()[1])
        entry['SQFT'] = ''.join(
            search.xpath('.//div[@class="value"]//text()').extract()[2])
        entry['year_built'] = search.xpath(
            './/span[@data-rf-test-name="homecard-amenities-year-built"]//text()'
        ).extract()[0]
        entry['rent'] = get_rent(str(entry['street']), str(entry['zipcode']))
        # BUG FIX: the original computed `price * 1 - (dp_percentage)`, which
        # subtracts 0.25 dollars from the price; the loan principal is the
        # price minus the down payment, i.e. price * (1 - dp_percentage).
        entry['mortgage_pmt'] = float(
            Loan(entry['price'] * (1 - dp_percentage), interest,
                 loan_term).monthly_payment)
        entry['insurance'] = insurance * make_float(entry['SQFT'])
        if entry['insurance'] == 0:
            # BUG FIX: the original used `==` (a no-op comparison), so the
            # $60 fallback was never actually assigned.
            entry['insurance'] = 60
        entry['tax'] = entry['price'] * tax_rate / 12
        entry['total_pmt'] = make_float(
            entry['HOA']
        ) + entry['mortgage_pmt'] + entry['insurance'] + entry['tax']
        entry['cashflow'] = get_cashflow(entry['rent'], entry['total_pmt'])  # , entry['price_estimate']
        yield entry

    # Follow pagination until the last results page.
    if int(total_page) > int(current_page):
        if int(current_page) == 1:
            next_page = response.url + "/page-2"
        else:
            # BUG FIX: the original pattern `[page-][\d]+` is a character
            # class (any one of "p a g e -" followed by digits) and only
            # matched "-<digits>" by accident; match the literal
            # "page-<digits>" URL segment instead.
            next_page = re.sub(r"page-\d+",
                               "page-" + str(int(current_page) + 1),
                               response.url)
        yield Request(next_page, callback=self.parse)
def parse(self, response):
    '''
    Scrape the archive for articles.

    The archive loads articles lazily when year/month sections are expanded,
    so a headless Selenium browser renders the page first; the rendered HTML
    is then handed to Scrapy for extraction. Yields one scrapy.Request per
    article that is not already in the database.

    Parameters
    ----------
    self: the PostillonSpider object
    response: The response from a scrapy request
    '''
    def init_selenium_driver():
        '''
        Initialize and return a firefox or chorme selenium driver depending
        on the option SELENIUM_DRIVER

        Returns
        -------
        A firefox or chrome selenium driver depending on the option
        SELENIUM_DRIVER
        '''
        if SELENIUM_DRIVER == 'Firefox':
            firefoxOptions = webdriver.FirefoxOptions()
            firefoxOptions.headless = True
            desired_capabilities = firefoxOptions.to_capabilities()
            driver = webdriver.Firefox(
                desired_capabilities=desired_capabilities)
        else:  # Chrome driver
            chrome_options = Options()
            chrome_options.headless = True
            driver = webdriver.Chrome(options=chrome_options)
        return driver

    def get_closed_elements():
        '''
        Returns all or some closed year and month elements, depending on the
        limit definitions (YEAR_TO_CRAWL / LIMIT_MIN_MONTH_OF_YEAR_TO_CRAWL).

        Returns
        -------
        All or some closed year and month elements, depending on the limit
        definitions.
        '''
        # Get all closed months of the year to crawl that are newer than or
        # equal to the limit specified by LIMIT_MIN_MONTH_OF_YEAR_TO_CRAWL.
        if LIMIT_MIN_MONTH_OF_YEAR_TO_CRAWL:
            # Locate the section for the requested year (class "year-<YYYY>").
            element_of_YEAR_TO_CRAWL = driver.find_element_by_class_name(
                'year-' + str(YEAR_TO_CRAWL))
            # Build an XPath matching closed months >= the limit month
            # (month-12 is always included as the seed of the "or" chain).
            xpath = ".//li[contains(@class, 'closed') and (contains(@class, 'month-12')"
            for month in range(LIMIT_MIN_MONTH_OF_YEAR_TO_CRAWL - 1, 12):
                # NOTE(review): month_plus_1 is assigned but never used; the
                # format call below recomputes month + 1 itself.
                month_plus_1 = month + 1
                xpath += " or contains(@class, 'month-" + "{:02d}".format(
                    month + 1) + "')"
            xpath = xpath + ")]"
            closed_elements = element_of_YEAR_TO_CRAWL.find_elements_by_xpath(
                xpath)
            # The year element itself must also be clicked open.
            closed_elements.append(element_of_YEAR_TO_CRAWL)
        # Get all closed months of the year to crawl.
        elif YEAR_TO_CRAWL:
            element_of_YEAR_TO_CRAWL = driver.find_element_by_class_name(
                'year-' + str(YEAR_TO_CRAWL))
            closed_elements = element_of_YEAR_TO_CRAWL.find_elements_by_class_name(
                'closed')
            closed_elements.append(element_of_YEAR_TO_CRAWL)
        # Get all closed years/months of the entire archive.
        else:
            # also finds closed months inside closed years
            closed_elements = driver.find_elements_by_class_name('closed')
        return closed_elements

    def waitForLoad():
        '''
        Wait until at 1 article per year has been loaded. If the current
        year is being crawled wait until an article of january or
        LIMIT_MIN_MONTH_OF_YEAR_TO_CRAWL has been loaded (Because the
        current month of the current year is already loaded on page load).
        '''
        CURRENT_YEAR = datetime.now().year
        TIMEOUT = 20  # seconds per wait.until() call
        wait = WebDriverWait(driver, TIMEOUT)
        try:
            # xpath for tag that with class 'date' and content that includes
            # '2020' or '1.2020' or '<LIMIT_MIN_MONTH_OF_YEAR_TO_CRAWL>.2020',
            # depending on what is to be crawled
            xpath = "//a/div/div/div[contains(@class, 'date') and contains(string(), '"
            if YEAR_TO_CRAWL:
                # If the current year is crawled wait for an article of the
                # first month to be loaded. This is necessary because the
                # current month is already loaded on page load.
                if YEAR_TO_CRAWL == CURRENT_YEAR:
                    if LIMIT_MIN_MONTH_OF_YEAR_TO_CRAWL:
                        xpath += str(
                            LIMIT_MIN_MONTH_OF_YEAR_TO_CRAWL) + "."
                    else:
                        xpath += "1."
                xpath += str(YEAR_TO_CRAWL) + "')]"
                wait.until(
                    EC.presence_of_element_located((By.XPATH, xpath)))
            # Wait for 1 artile per year (archive starts at 2008).
            else:
                base_xpath = xpath
                for i in range(2008, CURRENT_YEAR + 1):
                    # xpath for tag with class 'date' and the content that
                    # includes the year i
                    xpath = base_xpath + str(i) + "')]"
                    wait.until(
                        EC.presence_of_element_located((By.XPATH, xpath)))
        except TimeoutException as e:
            # Best-effort: log and continue with whatever has loaded so far.
            logging.warning(
                "TimeoutException has been thrown while waiting for articles to load: %s",
                e)

    def click_elements(elements):
        '''
        Click all elements in elements.

        Parameters
        ----------
        elements: HTML Elements to be clicked
        '''
        for element in elements:
            try:
                # element.click() causes Exception: "could not be scrolled into view"
                driver.execute_script("arguments[0].click();", element)
                # print("click: " + element.get_attribute('class').split()[1])
            except Exception as e:
                # Keep clicking the remaining elements even if one fails.
                logging.warning(
                    "An exception has been thrown while clicking closed years/months: %s",
                    e)

    driver = init_selenium_driver()
    driver.get(root)

    # Close all years/months
    click_elements(driver.find_elements_by_class_name('open'))
    # Open closed years/months to load articles
    click_elements(get_closed_elements())
    # Wait for articles to be loaded
    waitForLoad()

    # Hand-off between Selenium and Scrapy
    sel = Selector(text=driver.page_source)
    # for all ul tags with class 'month-inner' get all contained li tags and
    # get their direct a-tag children
    articleList = sel.xpath('//ul[@class="month-inner"]//li/a')
    articleList = utils.limit_crawl(articleList, TESTRUN_ARTICLES_LIMIT)
    if articleList:
        for article in articleList:
            # extract the value of the href attribute from article
            long_url = article.xpath('./@href').extract()[0]
            # extract the content of div-tags with class 'date' contained by article
            published_time = article.xpath(
                './/div[@class="date"]/text()').extract()
            published_time = published_time[0] if len(
                published_time) > 0 else ''
            if long_url and not utils.is_url_in_db(long_url):
                yield scrapy.Request(long_url,
                                     callback=self.parse_article,
                                     cb_kwargs=dict(
                                         long_url=long_url,
                                         published_time=published_time))
            else:
                # NOTE(review): utils() is called here as if it were a class —
                # verify log_event's expected first argument.
                utils.log_event(utils(), self.name, long_url, 'exists', 'info')
                logging.info('%s already in db', long_url)
    # Quit the selenium driver and close every associated window
    driver.quit()
from scrapy import Selector

# Demo: parse a small (deliberately sloppy) HTML snippet with scrapy's
# Selector and print the text of its <title> element.
html_snippet = " <html><head><title>Hello World</title></head><body></body> </ html> "
document = Selector(text=html_snippet)
page_title = document.xpath('//title/text ()').extract_first()
print(page_title)
def parseNews(self, response):
    """Parse a news article page.

    Matches the page against the news-page rule base, then either follows a
    "view all on one page" link, follows a "next page" link (carrying the
    already-extracted fields along via meta), or yields a finished NewsItem
    when the article fits on a single page.

    Raises
    ------
    ValueError
        If no rule in the rule base matches the page.
    """
    self.response_body_decode(response)
    sel = Selector(response)
    homeurl = tools.getHomeUrl(response.url)
    brandname = response.meta['brandname']
    news = None      # SelectorList holding the main body of the news page
    pagerule = None  # set once the applicable page rule has been determined
    # IDIOM FIX: dict.has_key() is deprecated Python-2-only API (removed in
    # Python 3); the `in` operator is equivalent and works on both.
    if 'pagerule' in response.meta:
        pagerule = response.meta['pagerule']
        news = sel.xpath(pagerule['pageform'])
    else:
        # Try every rule in the news-page rule base and crawl with the first
        # one whose page-form selector matches this page.
        for each_rule in newspage_type.page_rules:
            news = sel.xpath(each_rule['pageform'])
            if len(news) > 0:
                pagerule = each_rule
                break
    if pagerule is None:
        raise ValueError('Error processing (' + response.url + ') This page do not have corresponding rules')
    # Extract the "read all on one page" and "next page" URLs, when the rule
    # defines selectors for them.
    if pagerule['allpage'] is None:
        allpage = []
    else:
        allpage = news.xpath(pagerule['allpage']).extract()
    if pagerule['nextpage'] is None:
        nextpage = []
    else:
        nextpage = news.xpath(pagerule['nextpage']).extract()
    if len(allpage) > 0:
        # A "read the whole article on one page" link exists: follow it and
        # re-enter this parser with the rule already pinned in meta.
        if tools.isCompleteUrl(allpage[0]):
            url = allpage[0]
        else:
            url = homeurl + allpage[0]
        r = Request(url, callback=self.parseNews)
        r.meta['brandname'] = brandname
        r.meta['pagerule'] = pagerule
        yield r
    elif len(nextpage) > 0:
        # Multi-page article: extract title/date/content from this page and
        # hand the partial article to the next-page parser via meta.
        if tools.isCompleteUrl(nextpage[0]):
            url = nextpage[0]
        else:
            url = homeurl + nextpage[0]
        title = news.xpath(pagerule['title']).extract()
        date = self.getDate(news, response.url, pagerule['date'])
        content = self.getContent(news, pagerule['content'])
        article = {
            'brandname': brandname,
            'title': title,
            'date': date,
            'content': content
        }
        r = Request(url, callback=self.parseNextPage)
        r.meta['article'] = article
        r.meta['pagerule'] = pagerule
        yield r
    else:
        # Single-page article: extract everything and yield the finished item.
        title = news.xpath(pagerule['title']).extract()
        date = self.getDate(news, response.url, pagerule['date'])
        content = self.getContent(news, pagerule['content'])
        item = NewsItem()
        item['brandname'] = brandname
        item['date'] = date
        item['title'] = "".join(title)
        item['content'] = "".join(content)
        yield item
def reviews_parse(self, response):
    """Parse one page of a user's rated-movies list.

    Yields a reviewsItem per movie entry plus a Request for each movie's
    detail page, then follows the "next page" link back into this parser.
    """
    hxs = Selector(response)
    sites = hxs.xpath('//*[@class="article"]/div[2]/div[@class="item"]/div[@class="info"]/ul')
    for site in sites:
        # BUG FIX: the original created a single reviewsItem() before the
        # loop and mutated/yielded the same object every iteration; because
        # the item is mutable, consumers holding earlier yields saw them
        # overwritten by later iterations. Create a fresh item per entry.
        item = reviewsItem()
        item['userID'] = re.findall('people/(.+)/collect', response.url)
        item['moviename'] = site.xpath('li[@class="title"]/a/em/text()').extract()
        item['movieID'] = site.xpath('li[@class="title"]/a/@href').re('subject/(.+)/$')
        moviesUrl = site.xpath('li[@class="title"]/a/@href').extract()[0]
        # Crawl the movie's own detail page as well.
        yield Request(url=moviesUrl, callback=self.movie_parse)
        item['ratingdate'] = site.xpath('li[3]/span[@class="date"]/text()').extract()
        # The star rating is encoded in a CSS class such as "rating4-t".
        if re.findall('rating\d+-t', site.xpath('li[3]/span[1]/@class').extract()[0]):
            item['rating'] = site.xpath('li[3]/span[1]/@class').re('\d+')
        else:
            item['rating'] = [u'']
        if site.xpath('li[4]/span[@class="comment"]/text()').extract():
            item['comment'] = site.xpath('li[4]/span[@class="comment"]/text()').extract()
        else:
            item['comment'] = [u'']
        yield item
    # Follow pagination when a "next" link is present.
    if hxs.xpath('//*[@class="paginator"]/span[@class="next"]/a/@href').extract():
        nextreviewsUrl = hxs.xpath('//*[@class="paginator"]/span[@class="next"]/a/@href').extract()[0]
        yield Request(url=nextreviewsUrl, callback=self.reviews_parse)
def on_detail_page(self, response):
    """Extract a lemma detail page into a dict.

    Returns a dict containing the base-info key/value pairs plus 'summary'
    and 'logo' entries, or None when the page redirected or parsing failed.
    """
    # Only process pages that were not redirected.
    if response.url == response.old_url:
        try:
            hxs = Selector(text=response.content)
            # Collect the summary paragraphs and strip HTML from each.
            summary = hxs.xpath('//div[@class="card-summary-content"]/*').extract()
            content = []
            for ctx in summary:
                text = clean_html_text(ctx)
                content.append(text)
            content_text = " ".join(content)
            # Strip the citation markers that appear in the summary text.
            content_text = content_text.replace("[1]", "")
            content_text = content_text.replace("[2]", "")
            item_dict = {}
            # Base-info table: each child holds a <span> key and a <div> value.
            items = hxs.xpath('//div[@class="baseInfoWrap"]/div/div/*')
            for item in items:
                title = item.xpath('./span/text()').extract()
                title_value = item.xpath('./div/text()').extract()
                print("key:value", to_value(title), to_value(title_value))
                item_dict[to_value(title)] = to_value(title_value)
            item_dict['summary'] = content_text
            imgs = hxs.xpath('//div[@class="lemma-picture summary-pic"]/a/img/@src').extract()
            item_dict['logo'] = to_value(imgs)
            print(item_dict)
            # save_content(self.site.name, url, json.dumps(item_dict))
            # update_url(self.site.name, url, 200)
            return item_dict
        # MODERNIZATION FIX: `except Exception, e:` is Python-2-only syntax;
        # `except ... as ...` is equivalent and valid on Python 2.6+ and 3.x.
        except Exception as e:
            # update_url(self.site.name, url, 500)
            logging.error(e)