def parse_flight(e, route_leg):
    """Parse one flight-group DOM element into a list of flight dicts.

    :param e: lxml element containing ``<li>`` children, one per flight leg.
    :param route_leg: truthy for the return leg (uses ``request["return_date"]``),
        falsy for the outbound leg (uses ``request["depart_date"]``).
    :return: list of dicts with keys number/airline/origin/destination/
        departure/arrival/duration/route_leg/aircraft/__main_airline.

    NOTE(review): relies on module-level names defined elsewhere in this file:
    Grab, etree, re, datetime, ptime, request, flight_field_pattern,
    origin_destination_pattern. Original source was whitespace-collapsed; the
    block structure below is reconstructed — verify against upstream history.
    """
    # Re-parse the serialized element through a fresh Grab instance so the
    # css/xpath extension helpers can be used on this fragment alone.
    g0 = Grab()
    g0.response.body = etree.tostring(e)
    results = []
    offset_days = 0  # cumulative "+N Day" arrival offset carried across legs
    for f0 in g0.xpath_list('./li'):
        g = Grab()
        g.response.body = etree.tostring(f0)
        f = g.css_list('ul>li')
        def ff(index):
            # Text of field `index` of the current row, via flight_field_pattern.
            return re.match(flight_field_pattern, etree.tostring(f[index])).group(1)
        # Origin/destination come from the row's <h5> heading.
        h = re.match(origin_destination_pattern, g.css_text('h5'))
        def hh(index):
            return h.group(index)
        # Matches e.g. "11:45 PM + 1 Day" -> ("11:45 PM", "1").
        arrival_pattern_plus_day = re.compile('(\d+:\d+ [AP]M) \\+ (\d+) [dD]ay')
        if route_leg:
            base_date = request["return_date"]
        else:
            base_date = request["depart_date"]
        # ff(2) is the departure time field; shift by offsets from earlier legs.
        departure = ptime.get_full_date(str(base_date), ff(2))
        departure += datetime.timedelta(days = offset_days)
        arrival_time = ff(3)
        arrival_date = ptime.get_date(base_date)
        arrival_plus_day = re.match(arrival_pattern_plus_day, arrival_time.strip())
        if arrival_plus_day:
            # Arrival is on the next day; strip the "+ 1 Day" suffix and
            # remember the shift for subsequent legs.
            offset_days += 1
            arrival_time = arrival_plus_day.group(1)
        # Apply the accumulated offset (placement inferred from collapsed
        # source — TODO confirm it also applies when no "+ Day" marker matched).
        arrival_date += datetime.timedelta(days = offset_days)
        arrival = ptime.get_full_date(arrival_date, arrival_time)
        def sep_number(s):
            # Split e.g. "SU1234" into ("SU", "1234").
            r = re.match(re.compile('([\w\d][\w\d])(\d+)'), str(s))
            return r.group(1), r.group(2)
        airline, number = sep_number(ff(5))
        results.append({
            "number":number,
            "airline":airline, #ff(4),
            "origin":hh(1),
            "destination":hh(2),
            "departure":ptime.response_date(departure),
            "arrival":ptime.response_date(arrival),
            "duration":None, #ptime.str_timedelta(departure, arrival),
            "route_leg":str(int(route_leg)) ,
            "aircraft":None,
            "__main_airline":airline, #ff(4)
        })
    return results
def weather(city, lan):
    """Fetch today's forecast for `city` from p.ya.ru and translate it.

    :param city: city slug appended to https://p.ya.ru/ (e.g. 'moscow').
    :param lan: target language code (or pair) for the Yandex Translate API.
    :return: translated forecast string.

    NOTE(review): the API key is hardcoded below — move it to configuration.
    Relies on the module-level Grab import.
    """
    # Local import keeps the fix self-contained and works on both Python 2/3.
    try:
        from urllib.parse import quote
    except ImportError:  # Python 2
        from urllib import quote

    g = Grab()
    g.setup(document_charset='utf-8')
    g.go('https://p.ya.ru/' + city)
    wstr = g.doc.select('//div[@class="today-forecast"]').text()
    wstr = g.css_text('.temperature-wrapper') + '. ' + wstr
    key = 'trnsl.1.1.20160427T193202Z.39c058144b8ba50d.0c06365f68745560062f765550cb7a548557ee17'
    # BUG FIX: the forecast text was interpolated into the query string raw;
    # spaces, '&' and non-ASCII characters produced a malformed request.
    # URL-encode the free-text parameter before building the URL.
    g.go('https://translate.yandex.net/api/v1.5/tr/translate?key=' + key
         + '&lang=' + lan + '&text=' + quote(wstr.encode('utf-8')
                                             if isinstance(wstr, bytes) is False else wstr))
    wstr = g.doc.select('//text').text()
    return wstr
def get_and_show_proxy(): g = Grab() g.go('www.google.com/search?num=100&q=' + quote('free proxy +":8080"')) rex = re.compile(r'(?:(?:[-a-z0-9]+\.)+)[a-z0-9]+:\d{2,4}') #rex = re.compile(r'([0-9]{1,3}[.][0-9]{1,3}[.][0-9]{1,3}[.][0-9]{1,3}[:]\d{2,4})',re.MULTILINE) #for checking, proxy in enumerate(rex.findall(g.css_text('body').replace(' ',''))): for checking, proxy in enumerate(rex.findall(g.css_text('body'))): #sys.stdout.seek(0) print checking,":", proxy #sys.stdout.seek(0,0) g.setup(proxy=proxy, proxy_type='http', connect_timeout=5, timeout=5) try: g.go('google.com') except GrabError: pass #print proxy, 'FAIL' else: #print proxy, 'OK' # @type proxy_list list proxy_list.append(proxy)
# Demo script: scrape habrahabr.ru front page with Grab's lxml extension.
from grab import Grab
import logging
# DEBUG level makes Grab log every network request.
logging.basicConfig(level=logging.DEBUG)
g = Grab()
g.go('http://habrahabr.ru')
# Result of this call is discarded — presumably left over from exploration.
g.xpath('//h2/a[@class="topic"]').get('href')
# First topic title, via xpath and via the equivalent CSS selector.
print(g.xpath_text('//h2/a[@class="topic"]'))
print(g.css_text('h2 a.topic'))
print(g.css_number('.comments .all'))
from urllib.parse import urlsplit
# Hostnames of external (non-habrahabr, plain-http) links in post entries.
# NOTE(review): startswith('http:') also excludes https links — confirm intent.
print(', '.join(urlsplit(x.get('href')).netloc for x in g.css_list('.hentry a') if not 'habrahabr.ru' in x.get('href') and x.get('href').startswith('http:')))
# Demo script (Python 2): run a Google query built by the project-local
# google_lib helper and print selected parts of the result page.
from grab import Grab
from grab.ext import lxml
import google_lib as golib
query='crimea'
#file_out='out1.html'
# golib._url builds the Google search URL for the query.
go_url = golib._url(query)
g = Grab()
#g.go(go_url, log_file=file_out)
g.go(go_url)
# "About N results" counter.
print g.css_text('#resultStats')
print '---'
#print g.xpath_text('//*[h3[@class="r"]/a]')
# Result snippet text; note the space in '//div [@class="slp"]' is valid XPath.
print g.xpath_text('//div [@class="slp"]')
print '---'
#print g.css_text('#search')
#print g.xpath_text('//h2[@class="hd"]')
# Tk popup (Python 2): translate the current X selection via Google Translate
# and show it in a small always-on label.
# NOTE(review): `dlstr` (line width) and `ostr` (accumulator) are not defined
# in this chunk — presumably initialized earlier in the file; verify.
word = Tk()
# NOTE(review): these assign plain attributes instead of calling
# word.title('') / word.state('iconic'); as written they have no effect on
# the window — looks like a bug, left untouched here.
word.title = ''
word.state = 'iconic'
#word.iconify()
#word.withdraw()
# Current PRIMARY selection (the text the user has highlighted).
mword = word.selection_get()
g = Grab()
g.request(
    log_file="/tmp/pygdic.log",
    url="http://translate.google.com/translate_t?hl=ru&langpair=auto|ru&text=" + mword)
# Translated text and source transliteration from the response page.
autp = g.css_text('span#result_box')
trancecript = g.css_text('div#src-translit')
# Wrap the translation into dlstr-wide lines.
outplen = len(autp) / dlstr
for i in xrange(outplen + 1):
    ostr += autp[dlstr * i:dlstr * (i + 1)] + "\n"
label = Label(word, width=dlstr + 2, text=ostr + "\n" + trancecript, font="Arial 14", bg="#ffffaa", fg="blue")
label.pack(expand=True)
word.mainloop()
print autp, trancecript
# Repeat pass of the Tk translate popup above: tear down the previous window,
# grab the (possibly new) selection, translate and display again, then exit.
# NOTE(review): `dlstr`, `ostr` and the previous `word` come from earlier in
# the file — this chunk is not self-contained.
word.destroy()
word=Tk()
# NOTE(review): attribute assignments, not method calls — see note in the
# previous block; they do not actually set the title/state.
word.title=''
word.state='iconic'
#word.iconify()
#word.withdraw()
mword = word.selection_get()
g=Grab()
g.request(log_file="/tmp/pygdic.log", url="http://translate.google.com/translate_t?hl=ru&langpair=auto|ru&text="+mword)
autp=g.css_text('span#result_box')
trancecript=g.css_text('div#src-translit')
# Wrap the translation into dlstr-wide lines (ostr still holds earlier text).
outplen=len(autp)/dlstr
for i in xrange(outplen+1):
    ostr+=autp[dlstr*i:dlstr*(i+1)]+"\n"
label= Label(word,width=dlstr+2, text = ostr+"\n"+trancecript , font="Arial 14", bg="#ffffaa",fg="blue")
label.pack(expand=True)
word.mainloop()
print autp,trancecript
exit()
# Proxy checker script: scrape host:port candidates from a Google search
# results page and probe each one as an HTTP proxy.
from grab import Grab, GrabError
# BUG FIX: `from urllib import quote` exists only on Python 2, but this block
# uses print() calls in Python 3 style; import from the right module per
# version so the script runs on both.
try:
    from urllib.parse import quote
except ImportError:  # Python 2
    from urllib import quote
import re

g = Grab()
g.go('http://www.google.ru/search?num=100&q=' + quote('free proxy +":8080"'))
# host:port candidates, e.g. proxy.example.com:8080
rex = re.compile(r'(?:(?:[-a-z0-9]+\.)+)[a-z0-9]+:\d{2,4}')
# drop_space is a Grab text-extension helper that strips whitespace runs.
for proxy in rex.findall(g.drop_space(g.css_text('body'))):
    g.setup(proxy=proxy, proxy_type='http', connect_timeout=5, timeout=5)
    try:
        g.go('http://google.com')
    except GrabError:
        print(proxy, 'FAIL')
    else:
        print(proxy, 'OK')
class LXMLExtensionTest(TestCase):
    """Tests for Grab's lxml extension (xpath_*/css_* selector helpers).

    Uses module-level fixtures defined elsewhere in the file: SERVER (fake
    HTTP server), GRAB_TRANSPORT, HTML and XML sample documents, and the
    DataNotFound exception.
    """

    def setUp(self):
        SERVER.reset()

        # Create fake grab instance with fake response
        self.g = Grab(transport=GRAB_TRANSPORT)
        self.g.fake_response(HTML, charset='cp1251')

        # Parallel raw lxml tree used to contrast behavior with Grab helpers.
        from lxml.html import fromstring
        self.lxml_tree = fromstring(self.g.response.body)

    def test_lxml_text_content_fail(self):
        # lxml node text_content() method do not put spaces between text
        # content of adjacent XML nodes
        self.assertEqual(
            self.lxml_tree.xpath('//div[@id="bee"]/div')[0].text_content().strip(),
            u'пчела')
        self.assertEqual(
            self.lxml_tree.xpath('//div[@id="fly"]')[0].text_content().strip(),
            u'му\nха')

    def test_lxml_xpath(self):
        # Raw lxml xpath includes <style>/<script> nodes unless excluded.
        names = set(x.tag for x in self.lxml_tree.xpath('//div[@id="bee"]//*'))
        self.assertEqual(set(['em', 'div', 'strong', 'style', 'script']), names)
        names = set(x.tag for x in self.lxml_tree.xpath(
            '//div[@id="bee"]//*[name() != "script" and name() != "style"]'))
        self.assertEqual(set(['em', 'div', 'strong']), names)

    def test_xpath(self):
        # xpath_one: first match, DataNotFound on miss unless default given.
        self.assertEqual('bee-em', self.g.xpath_one('//em').get('id'))
        self.assertEqual(
            'num-2', self.g.xpath_one(u'//*[text() = "item #2"]').get('id'))
        self.assertRaises(DataNotFound,
                          lambda: self.g.xpath_one('//em[@id="baz"]'))
        self.assertEqual(None, self.g.xpath_one('//zzz', default=None))
        self.assertEqual('foo', self.g.xpath_one('//zzz', default='foo'))

    def test_xpath_text(self):
        # smart=True normalizes whitespace and skips script/style content.
        self.assertEqual(u'пче ла',
                         self.g.xpath_text('//*[@id="bee"]', smart=True))
        self.assertEqual(u'пчела mozilla = 777; body { color: green; }',
                         self.g.xpath_text('//*[@id="bee"]', smart=False))
        self.assertEqual(u'пче ла му ха item #100 2 item #2',
                         self.g.xpath_text('/html/body', smart=True))
        self.assertRaises(DataNotFound, lambda: self.g.xpath_text('//code'))
        # Attribute xpaths yield the attribute value directly.
        self.assertEqual(u'bee', self.g.xpath_one('//*[@id="bee"]/@id'))
        self.assertRaises(DataNotFound,
                          lambda: self.g.xpath_text('//*[@id="bee2"]/@id'))

    def test_xpath_number(self):
        # xpath_number extracts the first number from the matched text.
        self.assertEqual(100, self.g.xpath_number('//li'))
        self.assertEqual(100, self.g.xpath_number('//li', make_int=True))
        self.assertEqual('100', self.g.xpath_number('//li', make_int=False))
        # ignore_spaces joins digit groups separated by spaces.
        self.assertEqual(1002, self.g.xpath_number('//li', ignore_spaces=True))
        self.assertEqual(
            '1002',
            self.g.xpath_number('//li', ignore_spaces=True, make_int=False))
        self.assertRaises(DataNotFound,
                          lambda: self.g.xpath_number('//liza'))
        self.assertEqual('foo', self.g.xpath_number('//zzz', default='foo'))

    def test_xpath_list(self):
        self.assertEqual(['num-1', 'num-2'],
                         [x.get('id') for x in self.g.xpath_list('//li')])

    def test_css(self):
        # css_one mirrors xpath_one for CSS selectors.
        self.assertEqual('bee-em', self.g.css_one('em').get('id'))
        self.assertEqual('num-2', self.g.css_one('#num-2').get('id'))
        self.assertRaises(DataNotFound, lambda: self.g.css_one('em#baz'))
        self.assertEqual('foo', self.g.css_one('zzz', default='foo'))

    def test_css_text(self):
        self.assertEqual(u'пче ла', self.g.css_text('#bee', smart=True))
        self.assertEqual(u'пче ла му ха item #100 2 item #2',
                         self.g.css_text('html body', smart=True))
        self.assertRaises(DataNotFound, lambda: self.g.css_text('code'))
        self.assertEqual('foo', self.g.css_text('zzz', default='foo'))

    def test_css_number(self):
        self.assertEqual(100, self.g.css_number('li'))
        self.assertEqual('100', self.g.css_number('li', make_int=False))
        self.assertEqual(1002, self.g.css_number('li', ignore_spaces=True))
        self.assertRaises(DataNotFound, lambda: self.g.css_number('liza'))
        self.assertEqual('foo', self.g.css_number('zzz', default='foo'))

    def test_css_list(self):
        self.assertEqual(['num-1', 'num-2'],
                         [x.get('id') for x in self.g.css_list('li')])

    def test_strip_tags(self):
        # strip_tags removes markup; smart=True preserves word boundaries.
        self.assertEqual('foo', self.g.strip_tags('<b>foo</b>'))
        self.assertEqual('foo bar', self.g.strip_tags('<b>foo</b> <i>bar'))
        self.assertEqual('foobar', self.g.strip_tags('<b>foo</b><i>bar'))
        self.assertEqual('foo bar',
                         self.g.strip_tags('<b>foo</b><i>bar', smart=True))
        self.assertEqual('', self.g.strip_tags('<b> <div>'))

    def test_css_exists(self):
        self.assertTrue(self.g.css_exists('li#num-1'))
        self.assertFalse(self.g.css_exists('li#num-3'))

    def test_xpath_exists(self):
        self.assertTrue(self.g.xpath_exists('//li[@id="num-1"]'))
        self.assertFalse(self.g.xpath_exists('//li[@id="num-3"]'))

    def test_cdata_issue(self):
        g = Grab(transport=GRAB_TRANSPORT)
        g.fake_response(XML)

        # By default HTML DOM builder is used
        # It handles CDATA incorrectly
        self.assertEqual(None, g.xpath_one('//weight').text)
        self.assertEqual(None, g.tree.xpath('//weight')[0].text)

        # But XML DOM builder produces valid result
        #self.assertEqual(None, g.xpath_one('//weight').text)
        self.assertEqual('30', g.xml_tree.xpath('//weight')[0].text)

        # Use `content_type` option to change default DOM builder
        g = Grab(transport=GRAB_TRANSPORT)
        g.fake_response(XML)
        g.setup(content_type='xml')
        self.assertEqual('30', g.xpath_one('//weight').text)
        self.assertEqual('30', g.tree.xpath('//weight')[0].text)

    def test_xml_declaration(self):
        """
        HTML with XML declaration should be processed without errors.
        """
        SERVER.RESPONSE['get'] = """<?xml version="1.0" encoding="UTF-8"?>
<html><body><h1>test</h1></body></html>
"""
        g = Grab()
        g.go(SERVER.BASE_URL)
        self.assertEqual('test', g.xpath_text('//h1'))

    def test_empty_document(self):
        # Non-HTML and degenerate documents must not crash the DOM builder.
        SERVER.RESPONSE['get'] = 'oops'
        g = Grab()
        g.go(SERVER.BASE_URL)
        g.xpath_exists('//anytag')

        SERVER.RESPONSE['get'] = '<frameset></frameset>'
        g = Grab()
        g.go(SERVER.BASE_URL)
        g.xpath_exists('//anytag')
def parse_flight(e, route_leg):
    """Parse one flight-group DOM element into a list of flight dicts.

    :param e: lxml element containing ``<li>`` children, one per flight leg.
    :param route_leg: truthy for the return leg (uses ``request["return_date"]``),
        falsy for the outbound leg (uses ``request["depart_date"]``).
    :return: list of dicts with keys number/airline/origin/destination/
        departure/arrival/duration/route_leg/aircraft/__main_airline.

    NOTE(review): relies on module-level names defined elsewhere in this file:
    Grab, etree, re, datetime, ptime, request, flight_field_pattern,
    origin_destination_pattern. Original source was whitespace-collapsed; the
    block structure below is reconstructed — verify against upstream history.
    """
    # Re-parse the serialized element through a fresh Grab instance so the
    # css/xpath extension helpers can be used on this fragment alone.
    g0 = Grab()
    g0.response.body = etree.tostring(e)
    results = []
    offset_days = 0  # cumulative "+N Day" arrival offset carried across legs
    for f0 in g0.xpath_list('./li'):
        g = Grab()
        g.response.body = etree.tostring(f0)
        f = g.css_list('ul>li')
        def ff(index):
            # Text of field `index` of the current row, via flight_field_pattern.
            return re.match(flight_field_pattern,
                            etree.tostring(f[index])).group(1)
        # Origin/destination come from the row's <h5> heading.
        h = re.match(origin_destination_pattern, g.css_text('h5'))
        def hh(index):
            return h.group(index)
        # Matches e.g. "11:45 PM + 1 Day" -> ("11:45 PM", "1").
        arrival_pattern_plus_day = re.compile(
            '(\d+:\d+ [AP]M) \\+ (\d+) [dD]ay')
        if route_leg:
            base_date = request["return_date"]
        else:
            base_date = request["depart_date"]
        # ff(2) is the departure time field; shift by offsets from earlier legs.
        departure = ptime.get_full_date(str(base_date), ff(2))
        departure += datetime.timedelta(days=offset_days)
        arrival_time = ff(3)
        arrival_date = ptime.get_date(base_date)
        arrival_plus_day = re.match(arrival_pattern_plus_day,
                                    arrival_time.strip())
        if arrival_plus_day:
            # Arrival is on the next day; strip the "+ 1 Day" suffix and
            # remember the shift for subsequent legs.
            offset_days += 1
            arrival_time = arrival_plus_day.group(1)
        # Apply the accumulated offset (placement inferred from collapsed
        # source — TODO confirm it also applies when no "+ Day" marker matched).
        arrival_date += datetime.timedelta(days=offset_days)
        arrival = ptime.get_full_date(arrival_date, arrival_time)
        def sep_number(s):
            # Split e.g. "SU1234" into ("SU", "1234").
            r = re.match(re.compile('([\w\d][\w\d])(\d+)'), str(s))
            return r.group(1), r.group(2)
        airline, number = sep_number(ff(5))
        results.append({
            "number": number,
            "airline": airline, #ff(4),
            "origin": hh(1),
            "destination": hh(2),
            "departure": ptime.response_date(departure),
            "arrival": ptime.response_date(arrival),
            "duration": None, #ptime.str_timedelta(departure, arrival),
            "route_leg": str(int(route_leg)),
            "aircraft": None,
            "__main_airline": airline, #ff(4)
        })
    return results
class LXMLExtensionTest(TestCase):
    """Tests for Grab's lxml extension (xpath_*/css_* selector helpers).

    Uses module-level fixtures defined elsewhere in the file: SERVER (fake
    HTTP server), GRAB_TRANSPORT, HTML and XML sample documents, and the
    DataNotFound exception. Near-duplicate of the earlier copy of this class
    in the file — NOTE(review): consider deduplicating.
    """

    def setUp(self):
        SERVER.reset()

        # Create fake grab instance with fake response
        self.g = Grab(transport=GRAB_TRANSPORT)
        self.g.fake_response(HTML, charset='cp1251')

        # Parallel raw lxml tree used to contrast behavior with Grab helpers.
        from lxml.html import fromstring
        self.lxml_tree = fromstring(self.g.response.body)

    def test_lxml_text_content_fail(self):
        # lxml node text_content() method do not put spaces between text
        # content of adjacent XML nodes
        self.assertEqual(self.lxml_tree.xpath('//div[@id="bee"]/div')[0].text_content().strip(), u'пчела')
        self.assertEqual(self.lxml_tree.xpath('//div[@id="fly"]')[0].text_content().strip(), u'му\nха')

    def test_lxml_xpath(self):
        # Raw lxml xpath includes <style>/<script> nodes unless excluded.
        names = set(x.tag for x in self.lxml_tree.xpath('//div[@id="bee"]//*'))
        self.assertEqual(set(['em', 'div', 'strong', 'style', 'script']), names)
        names = set(x.tag for x in self.lxml_tree.xpath('//div[@id="bee"]//*[name() != "script" and name() != "style"]'))
        self.assertEqual(set(['em', 'div', 'strong']), names)

    def test_xpath(self):
        # xpath_one: first match, DataNotFound on miss unless default given.
        self.assertEqual('bee-em', self.g.xpath_one('//em').get('id'))
        self.assertEqual('num-2', self.g.xpath_one(u'//*[text() = "item #2"]').get('id'))
        self.assertRaises(DataNotFound, lambda: self.g.xpath_one('//em[@id="baz"]'))
        self.assertEqual(None, self.g.xpath_one('//zzz', default=None))
        self.assertEqual('foo', self.g.xpath_one('//zzz', default='foo'))

    def test_xpath_text(self):
        # smart=True normalizes whitespace and skips script/style content.
        self.assertEqual(u'пче ла', self.g.xpath_text('//*[@id="bee"]', smart=True))
        self.assertEqual(u'пчела mozilla = 777; body { color: green; }', self.g.xpath_text('//*[@id="bee"]', smart=False))
        self.assertEqual(u'пче ла му ха item #100 2 item #2', self.g.xpath_text('/html/body', smart=True))
        self.assertRaises(DataNotFound, lambda: self.g.xpath_text('//code'))
        # Attribute xpaths yield the attribute value directly.
        self.assertEqual(u'bee', self.g.xpath_one('//*[@id="bee"]/@id'))
        self.assertRaises(DataNotFound, lambda: self.g.xpath_text('//*[@id="bee2"]/@id'))

    def test_xpath_number(self):
        # xpath_number extracts the first number from the matched text.
        self.assertEqual(100, self.g.xpath_number('//li'))
        self.assertEqual(100, self.g.xpath_number('//li', make_int=True))
        self.assertEqual('100', self.g.xpath_number('//li', make_int=False))
        # ignore_spaces joins digit groups separated by spaces.
        self.assertEqual(1002, self.g.xpath_number('//li', ignore_spaces=True))
        self.assertEqual('1002', self.g.xpath_number('//li', ignore_spaces=True, make_int=False))
        self.assertRaises(DataNotFound, lambda: self.g.xpath_number('//liza'))
        self.assertEqual('foo', self.g.xpath_number('//zzz', default='foo'))

    def test_xpath_list(self):
        self.assertEqual(['num-1', 'num-2'], [x.get('id') for x in self.g.xpath_list('//li')])

    def test_css(self):
        # css_one mirrors xpath_one for CSS selectors.
        self.assertEqual('bee-em', self.g.css_one('em').get('id'))
        self.assertEqual('num-2', self.g.css_one('#num-2').get('id'))
        self.assertRaises(DataNotFound, lambda: self.g.css_one('em#baz'))
        self.assertEqual('foo', self.g.css_one('zzz', default='foo'))

    def test_css_text(self):
        self.assertEqual(u'пче ла', self.g.css_text('#bee', smart=True))
        self.assertEqual(u'пче ла му ха item #100 2 item #2', self.g.css_text('html body', smart=True))
        self.assertRaises(DataNotFound, lambda: self.g.css_text('code'))
        self.assertEqual('foo', self.g.css_text('zzz', default='foo'))

    def test_css_number(self):
        self.assertEqual(100, self.g.css_number('li'))
        self.assertEqual('100', self.g.css_number('li', make_int=False))
        self.assertEqual(1002, self.g.css_number('li', ignore_spaces=True))
        self.assertRaises(DataNotFound, lambda: self.g.css_number('liza'))
        self.assertEqual('foo', self.g.css_number('zzz', default='foo'))

    def test_css_list(self):
        self.assertEqual(['num-1', 'num-2'], [x.get('id') for x in self.g.css_list('li')])

    def test_strip_tags(self):
        # strip_tags removes markup; smart=True preserves word boundaries.
        self.assertEqual('foo', self.g.strip_tags('<b>foo</b>'))
        self.assertEqual('foo bar', self.g.strip_tags('<b>foo</b> <i>bar'))
        self.assertEqual('foobar', self.g.strip_tags('<b>foo</b><i>bar'))
        self.assertEqual('foo bar', self.g.strip_tags('<b>foo</b><i>bar', smart=True))
        self.assertEqual('', self.g.strip_tags('<b> <div>'))

    def test_css_exists(self):
        self.assertTrue(self.g.css_exists('li#num-1'))
        self.assertFalse(self.g.css_exists('li#num-3'))

    def test_xpath_exists(self):
        self.assertTrue(self.g.xpath_exists('//li[@id="num-1"]'))
        self.assertFalse(self.g.xpath_exists('//li[@id="num-3"]'))

    def test_cdata_issue(self):
        g = Grab(transport=GRAB_TRANSPORT)
        g.fake_response(XML)

        # By default HTML DOM builder is used
        # It handles CDATA incorrectly
        self.assertEqual(None, g.xpath_one('//weight').text)
        self.assertEqual(None, g.tree.xpath('//weight')[0].text)

        # But XML DOM builder produces valid result
        #self.assertEqual(None, g.xpath_one('//weight').text)
        self.assertEqual('30', g.xml_tree.xpath('//weight')[0].text)

        # Use `content_type` option to change default DOM builder
        g = Grab(transport=GRAB_TRANSPORT)
        g.fake_response(XML)
        g.setup(content_type='xml')
        self.assertEqual('30', g.xpath_one('//weight').text)
        self.assertEqual('30', g.tree.xpath('//weight')[0].text)

    def test_xml_declaration(self):
        """
        HTML with XML declaration should be processed without errors.
        """
        SERVER.RESPONSE['get'] = """<?xml version="1.0" encoding="UTF-8"?>
<html><body><h1>test</h1></body></html>
"""
        g = Grab()
        g.go(SERVER.BASE_URL)
        self.assertEqual('test', g.xpath_text('//h1'))

    def test_empty_document(self):
        # Non-HTML and degenerate documents must not crash the DOM builder.
        SERVER.RESPONSE['get'] = 'oops'
        g = Grab()
        g.go(SERVER.BASE_URL)
        g.xpath_exists('//anytag')

        SERVER.RESPONSE['get'] = '<frameset></frameset>'
        g = Grab()
        g.go(SERVER.BASE_URL)
        g.xpath_exists('//anytag')