def test_get_function(self):
    """Check the callables returned by ``Player.get_function``.

    The functions registered as ``'height1'`` and ``'height2'`` are each
    applied to a selector over a document whose ``<height>`` element holds
    ``3`` and must return the integer 3.
    """
    html = '<html><body><height>3'
    for name in ('height1', 'height2'):
        func = Player.get_function(name)
        # assertEqual, not the assertEquals alias: the alias was deprecated
        # for years and no longer exists in unittest on Python 3.12+.
        self.assertEqual(3, func(XpathSelector(parse_html(html))))
def set_input_by_number(self, number, value):
    """
    Assign a value to a text input chosen by its position in the form.

    :param number: index of the element among the form's
        ``input[@type="text"]`` elements
    :param value: value which should be set to that element
    """
    text_inputs = XpathSelector(self.form).select('.//input[@type="text"]')
    target = text_inputs[number].node()
    # The input is addressed by its ``name`` attribute from here on.
    field_name = target.get('name')
    return self.set_input(field_name, value)
def set_input_by_id(self, _id, value):
    """
    Assign a value to the form element carrying the given `id` attribute.

    :param _id: value of the element's ``id`` attribute
    :param value: value which should be set to that element
    """
    id_query = './/*[@id="%s"]' % _id
    form_not_chosen = self._lxml_form is None
    if form_not_chosen:
        # No form selected yet: pick one via the element matching the query.
        self.choose_form_by_element(id_query)
    matched = XpathSelector(self.form).select(id_query)
    field_name = matched.node().get('name')
    return self.set_input(field_name, value)
def task_article(self, grab, task):
    """Collect the fields of one article page and hand them to ``save_result``.

    Each field is the text of the first node matching an XPath query on
    ``grab.doc``; when nothing matches, a selector built from a fallback
    string is used as the default instead.

    :param grab: object exposing the fetched document as ``grab.doc``
    :param task: task providing ``current_page`` and ``url``
    """
    def first_text(query, fallback):
        # Text of the first match, or of the fallback selector when absent.
        return grab.doc.select(query).one(
            default=XpathSelector(fallback)).text()

    header = first_text("//span[@class='post_title']", 'No Header!')
    favorites = first_text("//*[@class='favs_count']", '0')
    score = first_text(
        "//div[@class='infopanel_wrapper']"
        "//div[contains(@class, 'mark')]/span[@class='score']",
        '0')
    comments = first_text("//*[@id='comments_count']", '0')
    author = first_text("//div[@class='author']/a", 'No Author!')
    published = first_text("//div[@class='published']", '0')

    record = {
        'page': task.current_page,
        'url': task.url,
        'header': header,
        'author': author,
        'favorites': favorites,
        'score': score,
        'comments': comments,
        'date': published,
    }
    self.save_result(data=record)
def main(**kwargs):
    """Time a direct ``tree.xpath`` lookup against ``XpathSelector.select``.

    Reads ``data/awesome_python.html`` once, then for each approach parses
    the bytes and looks up ``//title`` 500 times, printing the elapsed
    wall-clock seconds.

    :param kwargs: accepted but not used by this function
    """
    # Context manager so the file handle is closed deterministically; the
    # previous bare open(...).read() left that to the garbage collector.
    with open('data/awesome_python.html', 'rb') as inp:
        data = inp.read()

    start = time.time()
    for _ in range(500):
        tree = fromstring(data)
        assert tree.xpath('//title')[0].text.startswith('awesome-web-scraping')
    print('lxml:xpath %.2f' % (time.time() - start))

    start = time.time()
    for _ in range(500):
        tree = fromstring(data)
        assert XpathSelector(tree).select('//title').text().startswith('awesome-web-scraping')
    print('selection:select %.2f' % (time.time() - start))
def get_prices(self, grab, subject):
    """Parse the price list that follows the heading containing `subject`.

    Used for the "Obligatory extras" and "Optional extras" sections: the
    markup between the matching ``<h3>`` and the next ``</ul>`` is extracted
    and every ``<li class="list__item u-cf">`` becomes one entry.

    :param grab: object exposing the fetched document as ``grab.doc``
    :param subject: text interpolated into the heading regex as-is, so any
        regex metacharacters in it are interpreted as regex syntax
    :return: list of ``OrderedDict`` entries with ``name``, ``value``,
        optionally ``perweek`` or ``perday``, and ``currency``; ``None`` if
        the section is missing or holds no items
    """
    try:
        # Raw string: "\=" in a plain literal is an invalid escape sequence
        # (SyntaxWarning on Python 3.12+). The compiled pattern is unchanged.
        extras = grab.doc.rex_text(
            r'<h3 class\="h6 copy-sp-m">.*?%s.*?</h3>(.+?)</ul>' % subject,
            flags=re.S)
    except DataNotFound:
        logging.debug("Price %s is not found on %s", subject, grab.doc.url)
        return None

    sel = XpathSelector(fromstring(extras))
    prices = []
    for li in sel.select('//li[@class="list__item u-cf"]'):
        obligatory = OrderedDict()
        obligatory['name'] = li.select('node()').text()
        # The first character of the amount is the currency sign, the rest
        # is the number; thousands separators are dropped.
        money = li.select('node()/strong').text()
        obligatory['value'] = money[1:].replace(',', '')
        # Find perweek or perday
        if li.select('span[@class="boatview__extras-amount"'
                     ' and contains(text(),"per week")]').exists():
            obligatory['perweek'] = True
        elif li.select('span[@class="boatview__extras-amount"'
                       ' and contains(text(),"per day")]').exists():
            obligatory['perday'] = True
        obligatory['currency'] = money[0]
        prices.append(obligatory)

    if not prices:
        logging.debug("Price %s contains less than one element on: %s",
                      subject, grab.doc.url)
        return None
    return prices
def xpath(self, query):
    """Run `query` through an ``XpathSelector`` built on ``self.dom()``."""
    selector = XpathSelector(self.dom())
    return selector.select(query)
def select(self, *args, **kwargs):
    """Forward all arguments to ``select`` of a selector over ``self.tree``."""
    root_selector = XpathSelector(self.tree)
    return root_selector.select(*args, **kwargs)
def select(self, xpath):
    """Evaluate `xpath` with an ``XpathSelector`` over ``self.dom_tree``."""
    return XpathSelector(self.dom_tree).select(xpath)