def task_parse_page(self, grab, task):
    """Extract one user's profile fields and append them to the worksheet.

    A page whose ``profile_deleted`` marker has non-empty text is skipped.
    Otherwise the user id (``task.user_id``), page name, city and languages
    are read from ``grab.doc`` and written to columns 0-3 at index
    ``self.cur_col`` (incremented first); ``self.parsed`` is then
    incremented.

    Parsing is best-effort: any error while extracting or writing is logged
    at debug level and the page is skipped; nothing is raised to the caller.

    Args:
        grab: object whose ``doc.select(xpath)`` returns selectors.
        task: object providing ``user_id`` for the page being parsed.
    """
    # Local import: the file's import block is not visible from this block.
    import logging

    logger = logging.getLogger(__name__)

    try:
        deleted_marker = grab.doc.select(
            '//*[@id="profile_info"]/h4'
            '/div[contains(@class, "profile_deleted")]'
        ).text()
    except Exception:
        # Any failure here is treated as "no marker present". Presumably
        # ``text()`` raises when nothing matches -- TODO confirm against
        # the selector API.
        deleted_marker = ''
    if deleted_marker:
        # Deleted/hidden profile: nothing to record.
        return

    try:
        user_id = task.user_id
        username = grab.doc.select(
            '//*[@id="profile_info"]/h4'
            '/div[contains(@class, "page_name")]'
        ).text()
        city = grab.doc.select(
            "//*[@id='profile_full_info']"
            "//div[contains(@class, 'clear_fix')"
            " and div[@class='label fl_l'] = 'Город:']"
            "/div[@class='labeled fl_l']/a"
        ).one(default=XpathSelector('')).text()
        languages = grab.doc.select(
            "//*[@id='profile_info']"
            "//div[contains(@class, 'clear_fix')"
            " and div[@class='label fl_l'] = 'Языки:']"
            "/div[@class='labeled fl_l']/a"
        ).one(default=XpathSelector('')).text()

        # All values are extracted before the sheet is touched, so an
        # extraction failure never advances ``cur_col``.
        # NOTE(review): ``cur_col`` is passed as the first argument of
        # ``ws.write`` -- presumably a row index despite its name; confirm.
        self.cur_col += 1
        fields = (user_id, username, city, languages)
        for column, value in enumerate(fields):
            self.ws.write(self.cur_col, column, value)
        self.parsed += 1
    except Exception:
        # Keep the original skip-on-error behaviour, but leave a trace so
        # real bugs are not hidden.
        logger.debug(
            'Skipping profile page for user %r',
            getattr(task, 'user_id', None),
            exc_info=True,
        )
def test_number(self):
    """``number()`` works on both an element and an attribute selection.

    The expected values (4 and 6) depend on the ``self.tree`` fixture,
    which is built outside this block.
    """
    # assertEqual replaces the deprecated assertEquals alias.
    self.assertEqual(
        4, XpathSelector(self.tree).select('//ul/li[last()]').number())
    self.assertEqual(
        6, XpathSelector(self.tree).select('//ul/li[last()]/@id').number())
def test_get_function(self):
    """Each function returned by ``Player.get_function`` extracts 3.

    Both ``'height1'`` and ``'height2'`` are looked up and applied to a
    selector over a freshly parsed copy of the same HTML snippet.
    """
    html = '<html><body><height>3'
    for name in ('height1', 'height2'):
        func = Player.get_function(name)
        # Parse once per function, as the original did, so each call gets
        # its own tree. The name is passed as the failure message.
        self.assertEqual(3, func(XpathSelector(parse_html(html))), name)
def test_incorrect_xpath(self):
    """A boolean-valued XPath yields a selection for which ``exists()`` is False."""
    # The lxml xpath function returns a boolean for the following xpath.
    # That used to break the selector's internal logic, which assumed
    # only a list could be returned.
    # It was fixed, and this test was created to cover the case.
    sel = XpathSelector(self.tree).select('//ul/li/text()="oops"')
    self.assertEqual(False, sel.exists())

    # The selector list is always empty in this special case,
    # even if the xpath returns True at the lxml level.
    self.assertEqual(True, self.tree.xpath('//ul[1]/li[1]/text()="one"'))
    sel = XpathSelector(self.tree).select('//ul[1]/li[1]/text()="one"')
    self.assertEqual(False, sel.exists())
def test_xpath_selector(self):
    """Constructing ``XpathSelector`` writes a deprecation notice to stderr."""
    tree = fromstring('<div>test</div>')
    captured = StringIO()
    # Swap sys.stderr for an in-memory buffer while the selector is built.
    with mock.patch('sys.stderr', captured):
        XpathSelector(tree)
    # assertIn reports both operands on failure, unlike assertTrue(x in y).
    self.assertIn(
        'using XpathSelector from deprecated', captured.getvalue())
def _build_selector(cls, tree, selector_type):
    """Return a selector of the requested kind wrapping ``tree``.

    ``'xpath'`` gives an ``XpathSelector`` and ``'json'`` a
    ``JsonSelector``.

    Raises:
        GrabMisuseError: if ``selector_type`` is neither of the above.
    """
    if selector_type not in ('xpath', 'json'):
        raise GrabMisuseError('Unknown selector type: %s' % selector_type)
    selector_cls = XpathSelector if selector_type == 'xpath' else JsonSelector
    return selector_cls(tree)
def test_number(self):
    """``number()`` on the fourth ``li`` of the fixture tree returns 4."""
    sel = XpathSelector(self.tree).select('//ul/li[4]')
    # assertEqual replaces the deprecated assertEquals alias.
    self.assertEqual(4, sel.number())
def test_html(self):
    """``html()`` serialises the wrapped ``h1`` node back to markup."""
    heading = self.tree.xpath('//h1')[0]
    sel = XpathSelector(heading)
    # Surrounding whitespace is stripped before comparing.
    self.assertEqual('<h1>test</h1>', sel.html().strip())
def test_attr_list(self):
    """``attr_list('class')`` collects the class of every matched ``li``."""
    root = XpathSelector(self.tree)
    classes = root.select('//ul[@id="second-list"]/li').attr_list('class')
    # Compared as sets, so order and duplicates are ignored.
    self.assertEqual({'li-1', 'li-2'}, set(classes))
def test_one(self):
    """``one()`` and ``text()`` on a multi-item selection both give 'one'."""
    sel = XpathSelector(self.tree).select('//ul/li')
    self.assertEqual('one', sel.one().node.text)
    self.assertEqual('one', sel.text())
def test_select_select(self):
    """A relative ``select`` chained onto a selection runs per matched node."""
    root = XpathSelector(self.tree)
    first_items = root.select('//ul').select('./li[1]')
    # Compared as sets, so order and duplicates are ignored.
    self.assertEqual(
        {'one', 'yet one'},
        {item.text() for item in first_items},
    )
def test_text_list(self):
    """``text_list()`` returns the text of every first ``li`` in each ``ul``."""
    root = XpathSelector(self.tree)
    texts = root.select('//ul/li[1]').text_list()
    # Compared as sets, so order and duplicates are ignored.
    self.assertEqual({'one', 'yet one'}, set(texts))
def _selector(self):
    """Wrap ``self._node`` in a new ``XpathSelector`` and return it."""
    node = self._node
    return XpathSelector(node)
def test_text_selector(self):
    """Selecting a ``text()`` node yields a ``TextSelector`` instance."""
    sel = XpathSelector(self.tree).select('//li/text()').one()
    # assertIsInstance reports the actual type on failure.
    self.assertIsInstance(sel, TextSelector)
def test_textselector(self):
    """``text()`` on a ``//li/text()`` selection returns 'one'."""
    sel = XpathSelector(self.tree).select('//li/text()')
    self.assertEqual('one', sel.text())
def select(self, *args, **kwargs):
    """Forward to ``select`` on a new ``XpathSelector`` over ``self.tree``.

    All positional and keyword arguments are passed through unchanged and
    the result of the underlying call is returned.
    """
    root = XpathSelector(self.tree)
    return root.select(*args, **kwargs)
def test_select_node(self):
    """Indexing a selection exposes the underlying node via ``.node``."""
    first_heading = XpathSelector(self.tree).select('//h1')[0]
    self.assertEqual('test', first_heading.node.text)
def test_in_general(self):
    """Smoke test: building an ``XpathSelector`` over ``self.tree`` does not raise."""
    # The instance is not needed; only successful construction is checked.
    XpathSelector(self.tree)
def test_exists(self):
    """``exists()`` is True for a matching XPath and False otherwise.

    The fixture tree (built outside this block) is expected to have a
    fourth ``li`` but no fifth.
    """
    sel = XpathSelector(self.tree).select('//ul/li[4]')
    self.assertEqual(True, sel.exists())

    sel = XpathSelector(self.tree).select('//ul/li[5]')
    self.assertEqual(False, sel.exists())