def parse_company(self, response):
    """Extract one ParkCompanyItem per company on a park's company-list page,
    then follow pagination while the site still reports the expected page.

    Fixes: the original read ``item["park_md5"]`` after the loop, raising
    NameError when the page listed no companies; it also shadowed the builtin
    ``list`` and used Python-2-only ``cmp``.
    """
    response_selector = HtmlXPathSelector(response)
    company_sels = response_selector.select(self.companylist_xpath)
    park_md5 = response.meta["park_md5"]
    for sel in company_sels:
        item = ParkCompanyItem()
        item['company_name'] = sel.select(self.companyname_xpath).extract()[0]
        item["park_md5"] = park_md5
        yield item
    # Next page: the pager text looks like "PageList(<current>,...)".
    page = response.meta['page']
    next_msgs = response_selector.select(self.nextpage_xpath).extract()
    if len(next_msgs) > 0:
        currentPage = next_msgs[0].split(",")[0].split("(")[1]
        # Only continue when the page the site reports matches our counter;
        # a mismatch means we already walked past the last page.
        if str(page) == str(currentPage):
            if page == 1:
                url = response.url + "?page=" + str(page + 1)
            else:
                url = response.url.split("=")[0] + "=" + str(page + 1)
            print(url)
            yield scrapy.Request(url=url,
                                 callback=self.parse_company,
                                 meta={"park_md5": park_md5, "page": page + 1},
                                 headers=DEFAULT_REQUEST_HEADERS)
def parse_home(self, response):
    """Collect the configured fields from a park's home page into a dict and
    forward it (via meta) to the detail-page callback."""
    response_selector = HtmlXPathSelector(response)
    field_value = {'url': response.url, 'sub_url': response.meta["sub_url"]}
    for field_name in self.home_field_xpath:
        extracted = response_selector.select(self.home_field_xpath[field_name]).extract()
        if not extracted:
            field_value[field_name] = ""
        elif field_name == "industry":
            # A park may list several industries; keep them as one CSV string.
            field_value[field_name] = ",".join(extracted)
        else:
            field_value[field_name] = extracted[0]
    # The scraped image path is site-relative; prepend scheme and host.
    field_value["image"] = "http://" + response.url.split("/")[2] + field_value["image"]
    yield scrapy.Request(url=response.url + "detail/",
                         callback=self.parse_detail,
                         meta=field_value,
                         headers=DEFAULT_REQUEST_HEADERS)
def parse(self, response):
    """Parse a park-list page: follow each park's home page, then paginate.

    Fixes: guards the pagination extraction (the original did an unguarded
    ``extract()[0]`` that raised IndexError when no pager was present, unlike
    the sibling ``parse_company``); replaces the bare ``except`` with
    ``dict.get``; drops Python-2-only ``cmp``.
    """
    response_selector = HtmlXPathSelector(response)
    park_sels = response_selector.select(self.list_xpath)
    # The first request carries no page counter in meta; default to page 1.
    page = response.meta.get('page', 1)
    for sel in park_sels:
        home = sel.select(self.home_xpath).extract()[0]
        # Drops the first four characters of the scraped value to get the
        # link -- presumably stripping a fixed prefix; TODO confirm against
        # the actual markup.
        home_link = home[4:]
        yield scrapy.Request(url=home_link,
                             callback=self.parse_home,
                             meta={"sub_url": response.url},
                             headers=DEFAULT_REQUEST_HEADERS)
    # Pager text looks like "PageList(1,10,10,104,'gyyclass=0&...',8)".
    next_msgs = response_selector.select(self.nextpage_xpath).extract()
    if not next_msgs:
        return
    currentPage = next_msgs[0].split(",")[0].split("(")[1]
    # Follow the next page only while the site's reported current page matches
    # our counter; a mismatch (e.g. asked for 12, site says 11) means we have
    # reached the last page.
    if str(page) == str(currentPage):
        if page == 1:
            url = response.url + "?page=" + str(page + 1)
        else:
            url = response.url.split("=")[0] + "=" + str(page + 1)
        yield scrapy.Request(url=url,
                             callback=self.parse,
                             meta={"page": page + 1},
                             headers=DEFAULT_REQUEST_HEADERS)
def parse_torrent(self, response):
    """Build a TorrentItem (url, name, description, size) from a detail page."""
    selector = HtmlXPathSelector(response)
    item = TorrentItem()
    item['url'] = response.url
    item['name'] = selector.select("//h1/text()").extract()
    item['description'] = selector.select("//div[@id='description']").extract()
    item['size'] = selector.select("//div[@id='info-left']/p[2]/text()[2]").extract()
    return item
def parse_carro(self, response):
    """Scrape one car detail page into a Carro item."""
    selector = HtmlXPathSelector(response)
    item = Carro()
    item['modelo'] = selector.select('//div[@class="section primary"]/h1/text()').extract()[0]
    # The h4 header packs "year | type" into a single text node.
    header = selector.select('//div[@class="section primary"]/h4/text()').extract()[0]
    year, kind = header.split("|")
    item['anio'] = year
    item['tipo'] = kind
    # Price is the last space-separated token of the h3 text.
    price_text = selector.select('//div[@class="section primary"]/h3/text()').extract()[0]
    item['precio'] = price_text.split(" ")[-1]
    item['url'] = response.url
    yield item
def parse(self, response):
    """Extract a deputy's party, name and (optional) twitter handle."""
    selector = HtmlXPathSelector(response)
    item = DeputeInfoItem()
    item["party"] = selector.select('//td[@id="cbfv_55"]/text()').extract()
    item["name"] = selector.select('//h1[@class="title"]/span/span/text()').extract()
    account = selector.select('//td[@id="cbfv_60"]/a/text()').extract()
    print(account)
    # The cell holds a profile URL; keep only the trailing handle segment.
    item["twitter"] = account[0].split('/')[-1] if account else []
    yield item
def parseLineType(self, response):
    """Follow every timetable link of one line type, tagging each request with
    the line type and line name via meta."""
    selector = HtmlXPathSelector(response)
    lineType = selector.select('//*[@id="pagebar"]/h1/text()').extract()[0].strip()
    self.log('Processing %s...' % (lineType), level=log.DEBUG)
    for link in selector.select('//*[@id="tbl_fahrplaene"]/tbody/tr/td[2]/a'):
        url = urljoin(response.url, link.select('./@href').extract()[0])
        request = Request(url, callback=self.parseFahrplan)
        request.meta['lineType'] = lineType
        request.meta['lineName'] = link.select('./text()').extract()[0].strip()
        self.log("Following URL %s" % (url), level=log.DEBUG)
        yield request
def parse_brands(self, response):
    """From a brand page, follow either its category links (Dog, Cat, ...) or
    its direct product links, carrying a Link2LinkItem in request meta.

    Fixes: replaces the manual index counter with ``zip`` (also tolerating a
    category/link count mismatch instead of raising IndexError), removes
    commented-out cruft, an unused selection, and duplicated ``item['Brand']``
    assignments.
    """
    hxs = HtmlXPathSelector(response)
    brand = hxs.select("//*[@id='contentFull']/h1/text()").extract()
    legend = hxs.select("//*[@id='contentFull']/fieldset[2]/legend/text()").extract()[0]
    if "Products" in legend:
        # Categories exist: one request (and one fresh item) per category.
        categories = hxs.select("//*[@id='contentFull']/fieldset[2]/div/p[2]/a/text()").extract()
        category_links = hxs.select("//*[@id='contentFull']/fieldset[2]/div/p[2]/a/@href").extract()
        for category, link in zip(categories, category_links):
            item = Link2LinkItem(response.meta['item'])
            item['Brand'] = brand
            item['Products_Category'] = category
            yield Request(self.get_url(link), callback=self.parse_cats, meta={'item': item})
    else:
        # No categories: the brand page links straight to products.
        item = Link2LinkItem(response.meta['item'])
        item['Brand'] = brand
        if "Products_Category" not in item:
            item['Products_Category'] = "Not Available"
        product_links = hxs.select("//div[@id='contentFull']/fieldset[2]/div/p/a/@href").extract()
        for product_link in product_links:
            yield Request(self.get_url(product_link), callback=self.parse_products, meta={'item': item})
def parse_products(self, response):
    """Fill the product fields on the item carried in the request meta."""
    hxs = HtmlXPathSelector(response)
    item = Link2LinkItem(response.meta["item"])
    item['Specification'] = hxs.select("//div[@id='tab1']/p/text()").extract()
    item['Product_Name'] = hxs.select("//*[@id='contentFull']/h1/text()").extract()
    # Some products publish no guaranteed-analysis tab at all.
    analysis = hxs.select("//*[@id='tab2']/p/text()").extract()
    item['Guaranteed_Analysis'] = analysis if analysis else "Not Available"
    item['Product_Description'] = hxs.select(".//*[@id='contentFull']/p/text()").extract()
    yield item
def scrapeFlights(self, response):
    """Scrape the flights into a Fare() object.

    Fix: the error branch referenced an undefined name ``theError`` and would
    itself raise NameError; it now logs the error text scraped from the page.
    """
    html = HtmlXPathSelector(response)
    errors = html.select("//ul[@id='errors']/li/text()")
    # Catch errors given by Southwest's page
    if len(errors) > 0:
        self.log("Error: %s" % errors.extract()[0], level=log.ERROR)
        return
    # Conveniently packaged flight info in string form for form submission
    xpath = '//div[@class="productPricing"]//input/@title'
    selectors = [
        '//table[@id="faresOutbound"]//td[@class="price_column "]' + xpath,   # business select
        '//table[@id="faresOutbound"]//td[@class="price_column"][1]' + xpath, # anytime
        '//table[@id="faresOutbound"]//td[@class="price_column"][2]' + xpath  # wanna get away
    ]
    fareList = [html.select(selector).extract() for selector in selectors]
    # Process that info and load into a Fare() item.
    fareType = ["Business Select", "Anytime", "Wanna Get Away"]
    for i, fare in enumerate(fareList):
        self.log("Faretype: %d %s" % (i, fareType[i]))
        for flightString in fare:
            # Only strings starting with 'D' describe a departing flight.
            if flightString[0] != 'D':
                continue
            flightData = Util.parseFlight(flightString, self.outDate.date())
            self.log("Found: %s" % flightString)
            flight = Fare()
            for key in flightData:
                flight[key] = flightData[key]
            flight['origin'] = self.origin
            flight['destination'] = self.destination
            flight['date'] = self.outDate
            flight['faretype'] = fareType[i]
            self.log('Added')
            yield flight
def test_htmlxpathselector(self):
    """Deprecated HtmlXPathSelector: .select() still works, .css() raises."""
    with warnings.catch_warnings():
        warnings.simplefilter('ignore', ScrapyDeprecationWarning)
        hs = HtmlXPathSelector(text=self.text)
        expected = [u'<div><img src="a.jpg"><p>Hello</p></div>']
        self.assertEqual(hs.select("//div").extract(), expected)
        self.assertRaises(RuntimeError, hs.css, 'div')
def test_htmlxpathselector(self):
    """Deprecated HtmlXPathSelector (warnings recorded): XPath works, CSS raises."""
    with warnings.catch_warnings(record=True):
        hs = HtmlXPathSelector(text=self.text)
        expected = [u'<div><img src="a.jpg"><p>Hello</p></div>']
        self.assertEqual(hs.select("//div").extract(), expected)
        self.assertRaises(RuntimeError, hs.css, 'div')
def parse_yhzc(self, response):
    """Assemble the final ParkBaseItem from the fields accumulated in meta plus
    this page's preferential-policy text, then start the company-list crawl."""
    response_selector = HtmlXPathSelector(response)
    yhzcStr = "".join(response_selector.select(self.yhzc_xpath).extract())
    message = response.meta
    item = ParkBaseItem()
    item['preferential'] = yhzcStr.decode("utf-8")
    # Copy every field gathered by the earlier callbacks in the chain.
    for key in ('name', 'level', 'address', 'area', 'industry', 'image',
                'detail', 'plan', 'url', 'sub_url'):
        item[key] = message[key]
    digest = hashlib.md5()
    digest.update(item["url"])
    item["url_md5"] = digest.hexdigest()
    item['created'] = time.strftime('%Y-%m-%d %H:%M:%S', time.localtime(time.time()))
    yield item
    yield scrapy.Request(url=item['url'] + "company/",
                         callback=self.parse_company,
                         meta={"park_md5": item["url_md5"], "page": 1},
                         headers=DEFAULT_REQUEST_HEADERS)
def test_htmlxpathselector(self):
    """HtmlXPathSelector selects via XPath but refuses the CSS API."""
    with warnings.catch_warnings():
        warnings.simplefilter('ignore', ScrapyDeprecationWarning)
        hs = HtmlXPathSelector(text=self.text)
        result = hs.select("//div").extract()
        self.assertEqual(result, [u'<div><img src="a.jpg"><p>Hello</p></div>'])
        self.assertRaises(RuntimeError, hs.css, 'div')
def parse_cats(self, response):
    """Request every product page listed in this category, forwarding the item."""
    hxs = HtmlXPathSelector(response)
    item = Link2LinkItem(response.meta['item'])
    links = hxs.select("//div[@id='contentFull']/fieldset[2]/div/p/a/@href").extract()
    for link in links:
        yield Request(self.get_url(link), callback=self.parse_products, meta={'item': item})
def parse(self, response):
    """Follow every line-type link found on the overview page."""
    selector = HtmlXPathSelector(response)
    for anchor in selector.select('//*[@id="col3_content"]/div/div/div/a'):
        url = urljoin(response.url, anchor.select('./@href').extract()[0])
        self.log("Following URL %s" % (url), level=log.DEBUG)
        yield Request(url, callback=self.parseLineType)
def parse_list(self, response):
    """Queue a detail request for every activity entry on the listing page."""
    hxs = HtmlXPathSelector(response)
    for node in hxs.select("//li[@class='list-entry']/div[2]/div/a"):
        activity_url = self.href(node)
        self.log('Found activity url: %s' % activity_url)
        yield Request(activity_url, callback = self.parse_activity)
def parse(self, response):
    """Scrape every car on the listing page, then follow the paginator links."""
    selector = HtmlXPathSelector(response)
    # One <li> per car in the gallery listing.
    for entry in selector.select('//li[@class="standard_200 t gal-normal-mot"]'):
        item = Carro()
        item['modelo'] = entry.select('div/h3/a/text()').extract()[0]
        # The strong tag packs "year | type" into one text node.
        year, kind = entry.select('div[@class="itemInfo"]//h4/strong/text()').extract()[0].split("|")
        item['anio'] = year
        item['tipo'] = kind
        price_text = entry.select('div[@class="itemInfo"]//li[@class="precio_gal"]/strong/text()').extract()[0]
        item['precio'] = price_text.split(" ")[-1]
        item['url'] = entry.select('div/h3/a/@href').extract()[0]
        yield item
    # Paginate: re-enter this callback for every pager link.
    for pagina in selector.select("//div[@id='paginador']/a/@href").extract():
        yield Request(pagina, callback=self.parse)
def parse(self, response):
    """Emit one AppslistItem per app entry in the thumbnail list."""
    selector = HtmlXPathSelector(response)
    entries = selector.select('//div[contains(@class,"loops-wrapper list-thumb-image")]/div')
    for entry in entries:
        item = AppslistItem()
        item["title"] = entry.select("div/h2/a/text()").extract()
        item["image_link"] = entry.select("p/a/img/@src").extract()
        item["desc"] = entry.select("div/p/text()").extract()
        yield item
def parse(self, response):
    """Follow each brand link, sharing one fresh Link2LinkItem via request meta."""
    hxs = HtmlXPathSelector(response)
    item = Link2LinkItem()
    for brand_page in hxs.select("//div[@id='contentFull']/div/p/a/@href").extract():
        yield Request(self.get_url(brand_page), callback=self.parse_brands, meta={'item': item})
def scrapeFlights(self, response):
    """Scrape the flights into a Fare() object.

    Fix: the error branch logged an undefined name ``theError`` (NameError);
    it now logs the actual error text scraped from the page.
    """
    html = HtmlXPathSelector(response)
    errors = html.select("//ul[@id='errors']/li/text()")
    # Catch errors given by Southwest's page
    if len(errors) > 0:
        self.log("Error: %s" % errors.extract()[0], level=log.ERROR)
        return
    # Conveniently packaged flight info in string form for form submission
    xpath = '//div[@class="productPricing"]//input/@title'
    selectors = [
        '//table[@id="faresOutbound"]//td[@class="price_column "]' + xpath,   # business select
        '//table[@id="faresOutbound"]//td[@class="price_column"][1]' + xpath, # anytime
        '//table[@id="faresOutbound"]//td[@class="price_column"][2]' + xpath  # wanna get away
    ]
    fareList = []
    for selector in selectors:
        fareList.append(html.select(selector).extract())
    # Process that info and load into a Fare() item.
    fareType = ["Business Select", "Anytime", "Wanna Get Away"]
    for i, fare in enumerate(fareList):
        self.log("Faretype: %d %s" % (i, fareType[i]))
        for flightString in fare:
            # Only strings starting with 'D' describe a departing flight.
            if flightString[0] == 'D':
                flightData = Util.parseFlight(flightString, self.outDate.date())
                self.log("Found: %s" % flightString)
                flight = Fare()
                for key in flightData:
                    flight[key] = flightData[key]
                flight['origin'] = self.origin
                flight['destination'] = self.destination
                flight['date'] = self.outDate
                flight['faretype'] = fareType[i]
                self.log('Added')
                yield flight
            else:
                continue
def parse_plan(self, response):
    """Stash the planning text in meta and continue to the preferential-policy page."""
    selector = HtmlXPathSelector(response)
    plan_parts = selector.select(self.plan_xpath).extract()
    baseUrl = response.meta['url']
    response.meta['plan'] = "".join(plan_parts).decode("utf-8")
    yield scrapy.Request(url=baseUrl + "yhzc/",
                         callback=self.parse_yhzc,
                         meta=response.meta,
                         headers=DEFAULT_REQUEST_HEADERS)
def parse_detail(self, response):
    """Stash the detail text in meta and continue to the planning page."""
    selector = HtmlXPathSelector(response)
    detail_parts = selector.select(self.detail_xpath).extract()
    baseUrl = response.meta['url']
    response.meta['detail'] = "".join(detail_parts).decode("utf-8")
    yield scrapy.Request(url=baseUrl + "plan/",
                         callback=self.parse_plan,
                         meta=response.meta,
                         headers=DEFAULT_REQUEST_HEADERS)
def parse_next_page(self, response):
    """Read the total page count from the pager and request every listing page."""
    print("Fetch group home page: %s" % response.url)
    hxs = HtmlXPathSelector(response)
    pages_num = hxs.select("//*[@id='J_bottomPage']/span[2]/em[1]/b/text()").extract()[0]
    print(pages_num)
    page_url = hxs.select("//*[@id='J_bottomPage']/span[1]/a[2]/@href").extract()[0]
    print(page_url)
    # Split the sample URL around its page number so any page can be built.
    matcher = re.compile(r"(.+page=)[\d]+(.+)", re.UNICODE | re.VERBOSE)
    match = matcher.search(page_url)
    if not match:
        print("NONONONO!")
        return
    pre_url = match.group(1)
    post_url = match.group(2)
    for i in xrange(1, int(pages_num) + 1):  # iterate each page
        page_url = "http://list.jd.com" + pre_url + str(i) + post_url
        yield Request(page_url, callback=self.parse_detail)
def parse_activity(self, response):
    """Build an Activity entry (title, description, photos, key/value details)
    from an event detail page."""
    hxs = HtmlXPathSelector(response)
    info_node = hxs.select('//div[@class="event-info"]')
    title = self.text(info_node.select('h1'))
    details = {}
    for detail_node in info_node.select('div[@class="event-detail"]'):
        self.log(detail_node.extract())
        # Each node renders as "<span>Key:</span> value"; strip markup and
        # split on the first colon to recover the pair.
        key = self.text(detail_node.select('span')).strip().replace(':', '')
        value = re.sub('<[^<]+?>', '', detail_node.extract()).split(':')[1].strip()
        details[key] = value
    description = '\n'.join(hxs.select('//div[@class="related_info"]/div/div/text()').extract())
    # Thumbnail URLs become full-size photo URLs by swapping the path segment.
    photo_urls = [u.replace('albumicon', 'photo')
                  for u in hxs.select('//ul[contains(@class,"event-detail-photo")]/li/a/img/@src').extract()]
    entry = Activity()
    entry['id'] = response.url
    entry['title'] = title
    entry['description'] = description
    entry['images'] = photo_urls
    entry['details'] = details
    return entry
def parse(self, response):
    """Request every paginated listing page, then re-parse the first page itself."""
    hxs = HtmlXPathSelector(response)
    last_node = hxs.select("//div[@class='paginator']/a[last()]")
    # The last paginator link ends in "=<offset>"; offsets step by 10.
    last_page = int(self.href(last_node).split('=')[1])
    for offset in range(10, last_page, 10):
        page_url = response.url + '?start=' + str(offset)
        self.log('Found page url: %s' % page_url)
        yield Request(page_url, callback=self.parse_list)
    # The current response is page one; route it through parse_list too.
    yield Request(response.url, callback=self.parse_list)
def commons_speech_feeder(working_dir, _fetch_url=None):
    """Return a generator that yields file urls"""
    # TODO: find a faster way of doing this
    if not _fetch_url:
        _fetch_url = fetch_url
    list_url = 'http://ukparse.kforge.net/parldata/scrapedxml/debates/'
    log.debug("Fetching index")
    data = _fetch_url(list_url, "Failed to fetch index.")
    if not data:
        return
    hxs = HtmlXPathSelector(text=unicode(data, errors="ignore"))
    # Only hrefs matching "debatesYYYY..." that pass the working-dir check count.
    check_href = create_href_checker(re.compile(r'^debates\d{4}'), working_dir)
    urls = hxs.select(r'//table//td//a/@href').extract()
    log.debug("Fetched %s urls from index" % len(urls))
    for href in urls:
        if check_href(href):
            yield urlparse.urljoin(list_url, href)
def parse_list(self, response):
    """Scrape one movie post: title, second file link, and the date cell."""
    hxs = HtmlXPathSelector(response)
    body = hxs.select('//div[@id="contents"]')
    item = MovieItem()
    item['title'] = body.select('//div[@id="bo_v_title"]/h1/text()').extract()[0]
    item['link'] = body.select('//div[@class="bo_v_file"]/a/@href').extract()[1]
    item['date'] = body.select('//div[@id="bo_v_info"]/table/tbody/tr/td/table/tbody/tr/td/text()').extract()[2]
    if item['title']:
        print(item)
    else:
        print("Table Title")
    yield item
def _parse_jd(self, response):
    """Parse one scenic-spot listing page into MafengwoPlace detail requests,
    paginating for as long as pages keep yielding items."""
    meta = response.request.meta
    hxs = HtmlXPathSelector(response)
    items = hxs.select("//li[@class='item']/div[@class='store']")
    if items:
        # A non-empty page suggests another page may follow it.
        yield self._jd_request(meta['level'], meta['id'], meta['name'], meta['page'] + 1)
    for entry in items:
        place = MafengwoPlace()
        place['name'] = entry.select('.//h3/a/text()').extract()[0]
        rating = entry.select(".//div[@class='rank']/span/@class").extract()[0]
        if rating:
            # The digits after the 4-char class prefix are the score --
            # presumably a "rankN"-style class name; TODO confirm.
            place['rating'] = int(rating[4:])
        place['tags'] = entry.select(".//div[@class='tag']/a/text()").extract()
        path = entry.select('.//h3/a/@href').extract()[0]
        place['id'] = self._extract_id(path)
        place['p_id'] = meta['id']
        place['level'] = self._sub_level(meta['level'])
        yield self._jd_detail_request(place, path)
def test_htmlxpathselector(self):
    """HtmlXPathSelector (warnings recorded): XPath select works, .css raises."""
    with warnings.catch_warnings(record=True):
        hs = HtmlXPathSelector(text=self.text)
        result = hs.select("//div").extract()
        self.assertEqual(result, [u'<div><img src="a.jpg"><p>Hello</p></div>'])
        self.assertRaises(RuntimeError, hs.css, 'div')
def parse_fund(self, response):
    """Scrape a fund fact page into an HlscraperItem.

    Improvement: all but two fields live in a <tr> whose <th> text labels
    them, so the 18 near-identical normalize-space XPaths are deduplicated
    into one local helper.
    """
    x = HtmlXPathSelector(response)

    def row(label):
        # The <td> sibling of the <th> whose text contains `label`.
        # Double quotes inside the XPath allow labels with apostrophes.
        return x.select(
            'normalize-space(//tr/th[text()[contains(., "%s")]]/../td/text())' % label
        ).extract()

    fund = HlscraperItem()
    fund['Url'] = response.url
    fund['Name'] = x.select(
        "normalize-space(/html/body/div[@id='container']/div[@id='content']/div[@class='spacer-left-dbl']/div[@id='fund-section-content']/div[@class='spacer-bottom']/div[@id='security-title']/h1[@class='underline']/text())"
    ).extract()
    fund['ExdividendDate'] = row('Ex-dividend date')
    fund['PaymentDate'] = row('Payment date')
    fund['RunningYield'] = row('Running yield')
    fund['HistoricYield'] = row('Historic yield')
    fund['IncomePaid'] = row('Income paid')
    fund['TypeOfPayment'] = row('Type of payment')
    fund['LaunchDate'] = row('Launch date')
    fund['Sector'] = row('Sector')
    fund['FundSize'] = row('Fund size')
    fund['NumberOfHoldings'] = row('Number of holdings')
    fund['TypeOfUnits'] = row('Type of units')
    fund['FundType'] = row('Fund type')
    fund['NetInitialCharge'] = row('Net initial charge')
    fund['NetAnnualCharge'] = row('Net Annual charge')
    fund['OtherExpenses'] = row("Fund manager's other expenses")
    fund['PerformanceFee'] = row('Performance fee')
    fund['PlatformFee'] = row('HL Platform charge')
    # Wealth-150 membership is signalled by a badge image next to the title.
    fund['Wealth150'] = x.select(
        "/html/body/div[@id='container']/div[@id='content']/div[@class='spacer-left-dbl']/div[@id='fund-section-content']/div[@class='spacer-bottom']/div[@id='security-title']/h1[@class='underline']/a/img/@src"
    ).extract()
    return fund
def parse_fund(self, response):
    """Scrape a fund fact page into an HlscraperItem.

    Improvement: all but two fields live in a <tr> whose <th> text labels
    them, so the 18 near-identical normalize-space XPaths are deduplicated
    into one local helper.
    """
    x = HtmlXPathSelector(response)

    def row(label):
        # The <td> sibling of the <th> whose text contains `label`.
        # Double quotes inside the XPath allow labels with apostrophes.
        return x.select(
            'normalize-space(//tr/th[text()[contains(., "%s")]]/../td/text())' % label
        ).extract()

    fund = HlscraperItem()
    fund['Url'] = response.url
    fund['Name'] = x.select(
        "normalize-space(/html/body/div[@id='container']/div[@id='content']/div[@class='spacer-left-dbl']/div[@id='fund-section-content']/div[@class='spacer-bottom']/div[@id='security-title']/h1[@class='underline']/text())"
    ).extract()
    fund['ExdividendDate'] = row('Ex-dividend date')
    fund['PaymentDate'] = row('Payment date')
    fund['RunningYield'] = row('Running yield')
    fund['HistoricYield'] = row('Historic yield')
    fund['IncomePaid'] = row('Income paid')
    fund['TypeOfPayment'] = row('Type of payment')
    fund['LaunchDate'] = row('Launch date')
    fund['Sector'] = row('Sector')
    fund['FundSize'] = row('Fund size')
    fund['NumberOfHoldings'] = row('Number of holdings')
    fund['TypeOfUnits'] = row('Type of units')
    fund['FundType'] = row('Fund type')
    fund['NetInitialCharge'] = row('Net initial charge')
    fund['NetAnnualCharge'] = row('Net Annual charge')
    fund['OtherExpenses'] = row("Fund manager's other expenses")
    fund['PerformanceFee'] = row('Performance fee')
    fund['PlatformFee'] = row('HL Platform charge')
    # Wealth-150 membership is signalled by a badge image next to the title.
    fund['Wealth150'] = x.select(
        "/html/body/div[@id='container']/div[@id='content']/div[@class='spacer-left-dbl']/div[@id='fund-section-content']/div[@class='spacer-bottom']/div[@id='security-title']/h1[@class='underline']/a/img/@src"
    ).extract()
    return fund