def get_issues(url):
    """Download ``url`` and extract its entries with a page-specific XPath.

    The XPath is picked from the URL itself:

    * URLs containing ``new_list2.jsp`` yield the text of every ``a`` element.
    * URLs containing ``new_list3.jsp`` yield the ``title`` attribute of
      every ``td`` element.

    Raises:
        Exception: with the message ``'New DOM type.'`` when the URL matches
            neither page type. The page has already been downloaded by then.
    """
    response = requests.get(url)
    markup = response.text

    # Checked in order; the first marker found in the URL wins.
    page_queries = (
        ('new_list2.jsp', '//a/text()'),
        ('new_list3.jsp', '//td/@title'),
    )
    for marker, query in page_queries:
        if marker in url:
            break
    else:
        raise Exception('New DOM type.')

    return get.webpage(markup).xpath(query)
def google(query):
    """Return the number scraped from the search page for ``query``.

    The search URL is ``settings['base_url']`` followed by the UTF-8 encoded,
    URL-quoted query. The first text match of ``settings['xpath']`` on that
    page is scanned for digit runs, which are concatenated and converted to
    an ``int`` (so ``'1,234'`` becomes ``1234``).

    Raises:
        ValueError: if the matched text contains no digits at all.
    """
    base = settings['base_url']
    search_url = base + urllib2.quote(query.encode('utf-8'))

    page = get.webpage(get.htmltree(search_url))
    first_match = get.text(page, settings['xpath'])[0]

    # Join the digit groups so thousands separators do not split the number.
    digit_runs = re.findall(r'[0-9]+', first_match)
    return int(''.join(digit_runs))
def get_car_ids(path=None):
    """Return a list of car ids, from a file or from the listing page.

    Args:
        path: optional path of a text file holding one id per line. When it
            is truthy the file is read and no page is fetched.

    Returns:
        The lines of the file (without line endings), or, when no path is
        given, the ids cut out of the ``newLink`` hrefs found on the page at
        ``settings.LIST_URL[settings.BRAND]``.
    """
    if path:
        with open(path, 'r') as handle:
            contents = handle.read()
        return contents.splitlines()

    listing = get.htmltree(settings.LIST_URL[settings.BRAND])
    hrefs = get.webpage(listing).xpath('//table[@class="car_list"]//tr/td[@class="inf"]/a[@class="newLink"]/@href')

    ids = []
    for href in hrefs:
        # The id is whatever sits between the first '=' and the next '&'.
        after_equals = href.split('=')[1]
        ids.append(after_equals.split('&')[0])
    return ids
def get_car_ids(path=None):
    """Collect car ids either from a saved file or from the live listing.

    Args:
        path: optional path of a text file with one id per line. A falsy
            value (the default) means the listing page is scraped instead.

    Returns:
        A list of id strings: the file's lines when ``path`` is given,
        otherwise the part of each ``newLink`` href between the first ``=``
        and the following ``&`` on the page at
        ``settings.LIST_URL[settings.BRAND]``.
    """
    if not path:
        tree = get.htmltree(settings.LIST_URL[settings.BRAND])
        page = get.webpage(tree)
        links = page.xpath(
            '//table[@class="car_list"]//tr/td[@class="inf"]/a[@class="newLink"]/@href'
        )
        return [link.split('=')[1].split('&')[0] for link in links]

    with open(path, 'r') as source:
        text = source.read()
    return text.splitlines()
def get_car_info(encar_id):
    """Scrape the detail page of one car into a dict.

    The page address is ``settings.car_baseurl % encar_id``. Single values
    are read with the module's ``x`` helper and the seller fields with
    ``xte``.
    NOTE(review): ``x`` presumably returns the first XPath match and raises
    IndexError when there is none (see the ``tags`` fallback) - confirm.

    Args:
        encar_id: identifier substituted into ``settings.car_baseurl``.

    Returns:
        A dict with the keys ``encar_id``, ``name``, ``transmission``,
        ``fuel``, ``engine``, ``type``, ``tags``, ``price``, ``car_id``,
        ``birthday``, ``mileage``, ``color``, ``options``, ``seller``,
        ``accidents`` and ``encar`` (registration date, page hits and
        favourites).
    """
    info = dict(encar_id=encar_id)
    page = get.webpage(get.htmltree(settings.car_baseurl % encar_id))

    # summary
    summary = page.xpath('//div[@class="section summary hproduct"]')[0]
    info['name'] = [part.strip() for part in summary.xpath('.//h3[@class="car"]/span/text()')]
    summary_fields = (
        ('transmission', './/div[@class="short"]//li[@class="trs"]/text()'),
        ('fuel', './/div[@class="short"]//li[@class="fue"]/i/text()'),
        ('engine', './/div[@class="short"]//li[@class="eng"]/text()'),
        ('type', './/div[@class="short"]//li[@class="typ"]/text()'),
    )
    for key, query in summary_fields:
        info[key] = x(summary, query)
    try:
        info['tags'] = x(summary, './/div[@class="merit"]//span/text()')
    except IndexError:
        # No merit block on the page: record an empty tag list instead.
        info['tags'] = []
    info['price'] = int(x(summary, './/div[@class="prc"]//strong/text()'))

    # detail
    detail = page.xpath('//div[@class="field detail"]')[0]
    detail_fields = (
        ('car_id', './/li[@class="cid"]/i/text()'),
        ('birthday', './/li[@class="yer"]/i/text()'),
        ('mileage', './/li[@class="dts"]/i/text()'),
        ('color', './/li[@class="clr"]/i/text()'),
    )
    for key, query in detail_fields:
        info[key] = x(detail, query)

    # options: option label -> class attribute of its <sup> marker
    checked = page.xpath('//table[@class="option_table"]//span[@class="check"]')
    info['options'] = {x(node, './/a/text()'): x(node, './/sup/@class')
                       for node in checked}

    # seller: only the first five non-image, non-email <dd> entries are kept
    seller_nodes = page.xpath('//div[@class="field seller"]//dd[not(contains(@class, "image") or contains(@class, "email"))]')
    seller_query = './p/span/text()|./p/strong/text()|./p/strong/a/text()'
    info['seller'] = {x(node, './@class'): xte(node, seller_query)
                      for node in seller_nodes[:5]}

    # accidents
    accident_items = page.xpath('//ul[@class="acclist"]/li')
    info['accidents'] = {x(item, './b/text()'): x(item, './/strong/text()')
                         for item in accident_items}

    # etc
    raw_date = x(page, '//div[@class="field etc"]//span[@class="date"]/text()')
    registration_date = raw_date.strip(' :').replace('/', '-')
    raw_hits = x(page, '//div[@class="field etc"]//span[@class="hit"]/text()')
    page_hit = int(raw_hits.strip(': '))
    page_favs = int(x(page, '//div[@class="field etc"]//span[@class="hot"]//i/text()'))
    info['encar'] = {
        'registration_date': registration_date,
        'page_hit': page_hit,
        'page_favs': page_favs,
    }
    return info
def get_sitting_urls(assembly_id, div_id, sessionurl):
    """Build one sitting-list URL per session found on ``sessionurl``.

    Every link href on the page is parsed with ``parse_js_call`` and the
    ``mainsearch2`` calls are turned into query parameters by
    ``match_name_codes``.
    NOTE(review): this assumes the returned mapping holds exactly one
    ``SES_NUM<i>`` entry per session - confirm against match_name_codes.

    Args:
        assembly_id: not used by this function.
        div_id: not used by this function.
        sessionurl: address of the session page; also the base of every
            generated URL.

    Returns:
        A list of ``{'session_name': ..., 'url': ...}`` dicts.
    """
    page = get.webpage(get.htmltree(sessionurl))

    calls = []
    for href in page.xpath('.//a/@href'):
        calls.append(parse_js_call(href))

    params = match_name_codes(calls, filter='mainsearch2', type='sessions')
    # Count the entries before the extra 'j' key is added to the mapping.
    count = len(params)
    params['j'] = str(count)

    found = []
    for index in range(count):
        params['SES_NUM'] = params['SES_NUM%s' % index]
        query = urlencode(params)
        target = '%s&%s' % (sessionurl, query)
        # TODO: generalize me
        target = target.replace('con_search2', 'con_search3')
        found.append({'session_name': params['SES_NUM'], 'url': target})
    return found
def parse_page(page_num, attrs):
    """Parse every data row of one listing page and store it on disk.

    The page comes from ``get_html(page_num)``. The first two and the last
    ``tr`` of the matching table are skipped. Each remaining row is parsed
    with ``parse_row(row, attrs)``, dumped as JSON, and its ``'pdf'`` URL is
    downloaded. File names come from ``get_filename``.

    Args:
        page_num: page number handed to ``get_html``.
        attrs: passed through unchanged to ``parse_row``.
    """
    def _store_json(record):
        target = get_filename(record, 'json')
        with open(target, 'w') as out:
            json.dump(record, out, indent=2)

    def _store_pdf(record):
        target = get_filename(record, 'pdf')
        urllib.urlretrieve(record['pdf'], target)

    page = get.webpage(get_html(page_num))
    table_rows = page.xpath('//table[@background="../img/main_boxback2.gif"]//tr')

    # Rows 0-1 and the final row are not data rows.
    for table_row in table_rows[2:-1]:
        record = parse_row(table_row, attrs)
        _store_json(record)
        _store_pdf(record)
def get_session_urls(assembly_id, div_id, listurl):
    """Build one search URL per committee listed on ``listurl``.

    Every link href on the page is parsed with ``parse_js_call`` and the
    ``mainsearch`` calls are turned into query parameters by
    ``match_name_codes``.
    NOTE(review): the halving below assumes that mapping holds exactly one
    ``COMM_NAME<i>`` and one ``COMM_CODE<i>`` entry per committee - confirm
    against match_name_codes.

    Args:
        assembly_id: sent as the ``DAE_NUM`` query parameter.
        div_id: sent as the ``div`` query parameter.
        listurl: address of the listing page to scrape.

    Returns:
        A list of ``{'committee': ..., 'url': ...}`` dicts, one per
        committee; empty when the page lists none.
    """
    def searchform(root, num=''):
        # ``action`` attribute of the first form named ``searchform<num>``.
        return root.xpath('.//form[@name="searchform%s"]/@action' % num)[0]

    root = get.webpage(get.htmltree(listurl))
    js_calls = [parse_js_call(j) for j in root.xpath('.//a/@href')]
    params = match_name_codes(js_calls, filter='mainsearch', type='committees')

    # Floor division keeps the count an int. With ``/`` the value becomes a
    # float on Python 3 (or under ``from __future__ import division``),
    # which breaks ``range()`` and sends ``i`` as e.g. '3.0'.
    nsessions = len(params) // 2
    params['i'] = str(nsessions)
    params['div'] = str(div_id)
    params['DAE_NUM'] = str(assembly_id)

    # The form action is the same for every committee, so look it up once.
    # Skip the lookup when nothing will be generated, as it raises
    # IndexError on a page without the form. The last two characters of the
    # action are dropped.
    action = searchform(root)[:-2] if nsessions else None

    urls = []
    for i in range(nsessions):
        # The un-numbered keys carry the committee selected for this URL.
        params['COMM_NAME'] = params['COMM_NAME%s' % i]
        params['COMM_CODE'] = params['COMM_CODE%s' % i]
        urls.append({
            'committee': params['COMM_NAME'],
            'url': '%s/content/%s?%s' % (BASEURL, action, urlencode(params)),
        })
    return urls
def get_session_urls(assembly_id, div_id, listurl):
    """Return the per-committee search URLs scraped from ``listurl``.

    Link hrefs on the page are parsed with ``parse_js_call``; the
    ``mainsearch`` calls are converted to query parameters by
    ``match_name_codes``.
    NOTE(review): the committee count is taken as half the number of
    returned entries, i.e. one ``COMM_NAME<i>`` plus one ``COMM_CODE<i>``
    per committee is assumed - confirm against match_name_codes.

    Args:
        assembly_id: sent as the ``DAE_NUM`` query parameter.
        div_id: sent as the ``div`` query parameter.
        listurl: address of the listing page to scrape.

    Returns:
        A list of ``{'committee': ..., 'url': ...}`` dicts, one per
        committee; empty when the page lists none.
    """
    def searchform(root, num=''):
        # ``action`` attribute of the first form named ``searchform<num>``.
        return root.xpath('.//form[@name="searchform%s"]/@action' % num)[0]

    root = get.webpage(get.htmltree(listurl))
    js_calls = [parse_js_call(j) for j in root.xpath('.//a/@href')]
    params = match_name_codes(js_calls, filter='mainsearch', type='committees')

    # ``//`` instead of ``/``: true division yields a float on Python 3 (or
    # with ``from __future__ import division``), which ``range()`` rejects
    # and which would serialise ``i`` as e.g. '3.0'.
    nsessions = len(params) // 2
    params['i'] = str(nsessions)
    params['div'] = str(div_id)
    params['DAE_NUM'] = str(assembly_id)

    # Resolve the loop-invariant form action once. It is only needed (and
    # only guaranteed to exist) when at least one URL is generated. The last
    # two characters of the action are dropped.
    action = searchform(root)[:-2] if nsessions else None

    urls = []
    for i in range(nsessions):
        # Copy the i-th committee into the un-numbered keys for this URL.
        params['COMM_NAME'] = params['COMM_NAME%s' % i]
        params['COMM_CODE'] = params['COMM_CODE%s' % i]
        url = '%s/content/%s?%s' % (BASEURL, action, urlencode(params))
        urls.append({'committee': params['COMM_NAME'], 'url': url})
    return urls
def get_doc_ids(assembly_id, div_id, sittingurl):
    """List the documents linked from the sitting page ``sittingurl``.

    Every link href is parsed with ``parse_js_call``. Only calls whose first
    element equals ``'mainsearch4'`` are kept; the first two items of a
    call's second element become the sitting name and the document id.

    Args:
        assembly_id: not used by this function.
        div_id: not used by this function.
        sittingurl: address of the sitting page to scrape.

    Returns:
        A list of ``{'sitting_name': ..., 'docid': ...}`` dicts.
    """
    page = get.webpage(get.htmltree(sittingurl))

    # Parse every href first, then filter, as two separate passes.
    calls = []
    for href in page.xpath('.//a/@href'):
        calls.append(parse_js_call(href))

    documents = []
    for call in calls:
        if call[0] != 'mainsearch4':
            continue
        documents.append({'sitting_name': call[1][0], 'docid': call[1][1]})
    return documents
def get_car_info(encar_id):
    """Collect everything shown on one car's detail page.

    The page at ``settings.car_baseurl % encar_id`` is downloaded and read
    section by section (summary, detail, options, seller, accidents, etc)
    with the module's ``x`` and ``xte`` XPath helpers.
    NOTE(review): ``x`` presumably returns the first XPath match and raises
    IndexError when there is none (see the ``tags`` fallback) - confirm.

    Args:
        encar_id: identifier substituted into ``settings.car_baseurl``.

    Returns:
        A dict keyed by ``encar_id``, ``name``, ``transmission``, ``fuel``,
        ``engine``, ``type``, ``tags``, ``price``, ``car_id``, ``birthday``,
        ``mileage``, ``color``, ``options``, ``seller``, ``accidents`` and
        ``encar``.
    """
    info = dict(encar_id=encar_id)
    tree = get.htmltree(settings.car_baseurl % encar_id)
    doc = get.webpage(tree)

    # summary
    head = doc.xpath('//div[@class="section summary hproduct"]')[0]
    names = []
    for fragment in head.xpath('.//h3[@class="car"]/span/text()'):
        names.append(fragment.strip())
    info['name'] = names
    info.update({
        'transmission': x(head, './/div[@class="short"]//li[@class="trs"]/text()'),
        'fuel': x(head, './/div[@class="short"]//li[@class="fue"]/i/text()'),
        'engine': x(head, './/div[@class="short"]//li[@class="eng"]/text()'),
        'type': x(head, './/div[@class="short"]//li[@class="typ"]/text()'),
    })
    try:
        tags = x(head, './/div[@class="merit"]//span/text()')
    except IndexError:
        # Pages without a merit block simply have no tags.
        tags = []
    info['tags'] = tags
    price_text = x(head, './/div[@class="prc"]//strong/text()')
    info['price'] = int(price_text)

    # detail
    spec = doc.xpath('//div[@class="field detail"]')[0]
    info.update({
        'car_id': x(spec, './/li[@class="cid"]/i/text()'),
        'birthday': x(spec, './/li[@class="yer"]/i/text()'),
        'mileage': x(spec, './/li[@class="dts"]/i/text()'),
        'color': x(spec, './/li[@class="clr"]/i/text()'),
    })

    # options: option label -> class attribute of its <sup> marker
    info['options'] = {
        x(entry, './/a/text()'): x(entry, './/sup/@class')
        for entry in doc.xpath(
            '//table[@class="option_table"]//span[@class="check"]')
    }

    # seller: at most the first five <dd> entries that are neither the image
    # nor the email entry
    contacts = doc.xpath(
        '//div[@class="field seller"]//dd[not(contains(@class, "image") or contains(@class, "email"))]'
    )
    info['seller'] = {
        x(entry, './@class'):
        xte(entry, './p/span/text()|./p/strong/text()|./p/strong/a/text()')
        for entry in contacts[:5]
    }

    # accidents
    info['accidents'] = {
        x(entry, './b/text()'): x(entry, './/strong/text()')
        for entry in doc.xpath('//ul[@class="acclist"]/li')
    }

    # etc
    date_text = x(doc, '//div[@class="field etc"]//span[@class="date"]/text()')
    date_text = date_text.strip(' :').replace('/', '-')
    hit_text = x(doc, '//div[@class="field etc"]//span[@class="hit"]/text()')
    hits = int(hit_text.strip(': '))
    fav_text = x(doc, '//div[@class="field etc"]//span[@class="hot"]//i/text()')
    favs = int(fav_text)
    info['encar'] = {
        'registration_date': date_text,
        'page_hit': hits,
        'page_favs': favs,
    }
    return info
def get_hidden_url(url):
    """Return the absolute address of the second frame on the page at ``url``.

    The page is downloaded, the ``src`` attributes of all ``frame`` elements
    are collected, and the second one is appended to the module-level
    ``baseurl`` with a ``/`` in between.

    Raises:
        IndexError: if the page contains fewer than two frames.
    """
    response = requests.get(url)
    document = get.webpage(response.text)
    frame_sources = document.xpath('//frame/@src')
    # Index 1: the second frame is the one wanted, not the first.
    return '%s/%s' % (baseurl, frame_sources[1])