def navi_info(text):
    # parse
    text = OrderListPage.crop(text)
    text = OrderListPage.massage(text)
    soup = BeautifulSoup(text)
    page_navigator_table = soup.table(id="tblNavigator")[0]
    # the current page is the only anchor without an href; the next page,
    # if any, is its following sibling anchor
    current_page_anchor = page_navigator_table.find('a', href=None)
    next_page_anchor = current_page_anchor.findNextSibling('a')
    next_page_href = next_page_anchor["href"] if next_page_anchor else None
    navi_info = (current_page_anchor.string, next_page_href)
    logging.debug("current page: #%s, next page: %s" % navi_info)
    return navi_info
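# Usage sketch: walking every order-list page via navi_info. fetch_page()
# is a hypothetical HTTP helper standing in for however the surrounding
# module retrieves the order-list html; it is not defined in this file.
def walk_pages(first_page_text):
    text = first_page_text
    while True:
        current_page, next_href = navi_info(text)
        print 'visited page #%s' % current_page
        if next_href is None:  # last page: the current anchor has no next sibling
            break
        text = fetch_page(next_href)  # hypothetical fetch helper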
def fetch_one(klass, document, cache=False, sync=False):
    url = document.url if document.url.startswith("http") else LACHAMBRE_PREFIX + document.url
    soup = scraper.get(url, "a document %s" % document.lachambre_id)
    document.full_details_url = soup.table.a["href"]
    # ugly hack: BeautifulSoup fails to parse this html correctly, so re-fetch
    # both language versions through lxml and feed the serialized tables back
    # to BeautifulSoup
    soup, suppe = scraper.lxml_get_with_nl(url, "a document %s" % document.lachambre_id)
    table = BeautifulSoup(etree.tostring(soup.xpath('//table')[0], pretty_print=True))
    table_nl = BeautifulSoup(etree.tostring(suppe.xpath('//table')[0], pretty_print=True))
    dico = document_to_dico(list(table.table('tr', recursive=False)))
    dico_nl = document_to_dico(list(table_nl.table('tr', recursive=False)))
    del dico[""]
    del dico_nl[""]
    klass._get_first_level_data(dico, dico_nl, document)
    klass._get_in_charged_commissions(dico, dico_nl, document)
    klass._get_plenaries(dico, dico_nl, document)
    klass._get_senat_plenaries(dico, dico_nl, document)
    klass._get_competences(dico, dico_nl, document)
    klass._get_document_chambre(dico, dico_nl, document)
    klass._get_document_senat(dico, dico_nl, document)
    document.done = True
    document.save()
    logger.info("parsed document [%s] %s" % (document.lachambre_id, document.title["fr"]))
    dico.die_if_got_not_accessed_keys()
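# dico behaves like a dict that records which keys were read, so the final
# die_if_got_not_accessed_keys() call can flag table rows no parser consumed.
# A minimal sketch of such a structure, assuming document_to_dico (defined
# elsewhere) keys each row by its first-cell label:
class AccessWatchDict(dict):
    def __init__(self, *args, **kwargs):
        dict.__init__(self, *args, **kwargs)
        self._accessed = set()

    def __getitem__(self, key):
        self._accessed.add(key)
        return dict.__getitem__(self, key)

    def die_if_got_not_accessed_keys(self):
        left_over = set(self.keys()) - self._accessed
        if left_over:
            raise Exception("unparsed rows: %s" % ", ".join(map(repr, left_over)))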
def index(url):
    soup = BeautifulSoup(getRequest(url))
    if 'truck_' in url:
        try:
            items = soup('div', attrs={'class': "nscrRow"})
            if len(items) < 1:
                raise Exception('no schedule rows')  # bail out to the live fallback
            for i in items:
                name = i('div')[2].string + ' - ' + i('div')[0].string + ' - ' + i('div')[1].string
                addLink(name, 'url', 4, os.path.join(home, 'icon.png'), False)
        except:
            print ' Looks Live '
            getVideoLinks(page_truck + 'config.xml')
    if 'nns_' in url:
        try:
            items = soup.table('tr')
            if len(items) < 1:
                raise Exception('no schedule rows')
            for i in items:
                name = i('td')[0].string + ' - ' + i('td')[2].string + ' - ' + i('td')[1].string + ' - ' + i('td')[3].string
                addLink(name, 'url', 4, os.path.join(home, 'icon.png'), False)
        except:
            getVideoLinks(page_nns + 'config.xml')
    if 'race_' in url:
        try:
            # probe for the schedule header image; if it is missing the lookup
            # raises and we fall through to the live stream links
            image = soup('div', attrs={'id': "nscrHeaderWrapper"})[0].img['src']
            xbmc.executebuiltin("XBMC.Notification(RaceBuddy,No schedule yet - Coverage expected to start in August,15000," + icon + ")")
        except:
            getVideoLinks(page_cup + 'config.xml')
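# addLink above is the customary XBMC plugin helper; a minimal sketch of the
# conventional implementation, assuming sys.argv[0]/sys.argv[1] carry the
# plugin url and handle as xbmc passes them to every addon invocation (the
# real helper lives elsewhere in this addon):
import sys
import urllib
import xbmcgui
import xbmcplugin

def addLink(name, url, mode, iconimage, isFolder):
    # route back into this addon with the target encoded in the query string
    u = sys.argv[0] + '?url=' + urllib.quote_plus(url) + '&mode=' + str(mode) + '&name=' + urllib.quote_plus(name)
    liz = xbmcgui.ListItem(name, iconImage=iconimage, thumbnailImage=iconimage)
    liz.setInfo(type='Video', infoLabels={'Title': name})
    xbmcplugin.addDirectoryItem(handle=int(sys.argv[1]), url=u, listitem=liz, isFolder=isFolder)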
def parse_order_page(text):
    # crop the document down to the order-list markup
    start_pattern = '''<div id="ordList"'''
    end_pattern = '''<script language='JavaScript'>'''
    start_idx = text.find(start_pattern)
    end_idx = text[start_idx:].find(end_pattern) + start_idx
    if end_idx == start_idx - 1:  # find() returned -1
        # order_url (like target_month below) comes from module scope
        raise Exception('cannot find end pattern from %s: %s' % (order_url, end_pattern))
    text = text[start_idx:end_idx]
    # massage: strip the stray semicolon that trips up BeautifulSoup
    text = text.replace('''<a style="cursor:hand";''', '''<a style="cursor:hand"''')
    # parse
    soup = BeautifulSoup(text)
    order_list_table = soup.table(id="MyOrderListTbl")[0]
    page_navigator_table = soup.table(id="tblNavigator")[0]
    # navigation
    current_page_anchor = page_navigator_table.find('a', href=None)
    next_page_anchor = current_page_anchor.findNextSibling('a')
    next_page_href = next_page_anchor["href"] if next_page_anchor else None
    print 'current page:', current_page_anchor.string
    print 'next page:', next_page_href
    navi_info = (current_page_anchor.string, next_page_href)
    # order list: skip single-cell separator rows and 1px spacer cells
    orders = []
    remove_bogus_rows = lambda tag: tag.name == u'tr' and len(tag.findAll('td')) != 1
    remove_bogus_cell = lambda tag: tag.name == u'td' and tag.get('width') != u'1'
    for tr in order_list_table.find('tr').findNextSiblings(remove_bogus_rows):
        tds = tr.findAll(remove_bogus_cell)
        order_id = tds[0].b.string
        order_detail_link = get_order_detail_link(order_id)
        order_date = tds[1].string
        order_name = tds[2].span.string
        order_price = tds[3].b.string
        pkg_num = tds[3].b.string.next.rsplit('/')[-1]
        deliver_state_link = get_deliver_state_link(order_id)
        if not str(order_date).startswith(target_month):
            continue
        #print '-', order_date, order_id, order_name, order_price, pkg_num, deliver_state_link
        #print '[%s] on %s, bought %s won worth (%s items): %s' % (order_id, order_date, order_price, pkg_num, order_name)
        orders.append((order_id, order_date, order_price, pkg_num, order_name))
    return (orders, navi_info)
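# navi_info above and order_info below factor the crop/massage steps that
# parse_order_page inlines into an OrderListPage helper. A plausible sketch
# under that assumption (the real class is not shown in this file):
class OrderListPage(object):
    START_PATTERN = '''<div id="ordList"'''
    END_PATTERN = '''<script language='JavaScript'>'''

    @staticmethod
    def crop(text):
        # cut the document down to the order-list markup
        start_idx = text.find(OrderListPage.START_PATTERN)
        end_idx = text[start_idx:].find(OrderListPage.END_PATTERN) + start_idx
        if end_idx == start_idx - 1:  # find() returned -1
            raise Exception('cannot find end pattern: %s' % OrderListPage.END_PATTERN)
        return text[start_idx:end_idx]

    @staticmethod
    def massage(text):
        # strip the stray semicolon that breaks BeautifulSoup's parse
        return text.replace('''<a style="cursor:hand";''', '''<a style="cursor:hand"''')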
def order_info(text, target_month):
    ''' parse order list page and return list of Order objects '''
    # parse
    text = OrderListPage.crop(text)
    text = OrderListPage.massage(text)
    soup = BeautifulSoup(text)
    # order list
    orders = []
    order_list_table = soup.table(id="MyOrderListTbl")[0]
    remove_bogus_rows = lambda tag: tag.name == u'tr' and len(tag.findAll('td')) != 1
    for tr in order_list_table.find('tr').findNextSiblings(remove_bogus_rows):
        order = Order.build_from_order_list_page(tr, target_month)
        if order is None:
            continue
        orders.append(order)
    logging.debug("%d orders total." % len(orders))
    return orders
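# Order.build_from_order_list_page is not defined in this file; a minimal
# sketch, assuming it lifts the per-row extraction out of parse_order_page
# above (same td layout, None for rows outside the target month):
class Order(object):
    def __init__(self, order_id, order_date, order_price, pkg_num, order_name):
        self.order_id = order_id
        self.order_date = order_date
        self.order_price = order_price
        self.pkg_num = pkg_num
        self.order_name = order_name

    @classmethod
    def build_from_order_list_page(cls, tr, target_month):
        # drop the 1px spacer cells, as parse_order_page does
        remove_bogus_cell = lambda tag: tag.name == u'td' and tag.get('width') != u'1'
        tds = tr.findAll(remove_bogus_cell)
        order_date = tds[1].string
        if not str(order_date).startswith(target_month):
            return None  # outside the month being collected
        return cls(tds[0].b.string, order_date, tds[3].b.string,
                   tds[3].b.string.next.rsplit('/')[-1], tds[2].span.string)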