Example #1
    def navi_info(text):
        # parse
        text = OrderListPage.crop(text)
        text = OrderListPage.massage(text)
        soup = BeautifulSoup(text)

        page_navigator_table = soup.table(id="tblNavigator")[0]
        current_page_anchor  = page_navigator_table.find('a', href=None)
        next_page_anchor     = current_page_anchor.findNextSibling('a')
        next_page_href = next_page_anchor["href"] if next_page_anchor else None
    
        navi_info = (current_page_anchor.string, next_page_href)
        logging.debug("current page: #%s, next page: %s" % navi_info)
        return navi_info
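A minimal usage sketch of the (current_page, next_href) tuple returned above; fetch_page and first_url are hypothetical stand-ins, and navi_info is assumed to be callable from the enclosing scope:

    page_text = fetch_page(first_url)              # hypothetical HTTP helper
    while True:
        current_page, next_href = navi_info(page_text)
        if next_href is None:                      # no next anchor on the last page
            break
        page_text = fetch_page(next_href)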
Example #2
    def fetch_one(klass, document, cache=False, sync=False):
        url = document.url if document.url.startswith("http") else LACHAMBRE_PREFIX + document.url
        soup = scraper.get(url, "a document %s" % document.lachambre_id)
        document.full_details_url = soup.table.a["href"]
        # workaround: BeautifulSoup alone fails to parse this HTML correctly, so re-fetch with
        # lxml and hand BeautifulSoup only the serialized <table> subtrees (soup/suppe: French and Dutch pages)
        soup, suppe = scraper.lxml_get_with_nl(url, "a document %s" % document.lachambre_id)
        table = BeautifulSoup(etree.tostring(soup.xpath('//table')[0], pretty_print=True))
        table_nl = BeautifulSoup(etree.tostring(suppe.xpath('//table')[0], pretty_print=True))
        dico = document_to_dico(list(table.table('tr', recursive=False)))
        dico_nl = document_to_dico(list(table_nl.table('tr', recursive=False)))

        del dico[""]
        del dico_nl[""]

        klass._get_first_level_data(dico, dico_nl, document)
        klass._get_in_charged_commissions(dico, dico_nl, document)
        klass._get_plenaries(dico, dico_nl, document)
        klass._get_senat_plenaries(dico, dico_nl, document)
        klass._get_competences(dico, dico_nl, document)
        klass._get_document_chambre(dico, dico_nl, document)
        klass._get_document_senat(dico, dico_nl, document)

        document.done = True
        document.save()
        logger.info("parsed document [%s] %s" % (document.lachambre_id, document.title["fr"]))
        dico.die_if_got_not_accessed_keys()
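A minimal, self-contained sketch of the lxml-then-BeautifulSoup workaround used above: parse with lxml, serialize just the <table> subtree, and hand that fragment to BeautifulSoup. The HTML string is made up for illustration:

    from lxml import etree
    from lxml.html import fromstring
    from BeautifulSoup import BeautifulSoup

    doc = fromstring("<html><body><table><tr><td>doc title</td></tr></table></body></html>")
    # re-serialize only the subtree lxml located, then let BeautifulSoup parse that fragment
    fragment = etree.tostring(doc.xpath('//table')[0], pretty_print=True)
    table = BeautifulSoup(fragment)
    print table.table.tr.td.string   # -> doc title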
Example #3
def index(url):
    soup = BeautifulSoup(getRequest(url))
    if 'truck_' in url:
        try:
            # truck-series schedule rows; no rows means the page has gone live
            items = soup('div', attrs={'class': "nscrRow"})
            if len(items) < 1:
                raise Exception('no schedule rows')
            for i in items:
                name = i('div')[2].string + ' - ' + i(
                    'div')[0].string + ' - ' + i('div')[1].string
                addLink(name, 'url', 4, os.path.join(home, 'icon.png'), False)
        except:
            print ' Looks Live '
            getVideoLinks(page_truck + 'config.xml')
    if 'nns_' in url:
        try:
            items = soup.table('tr')
            if len(items) < 1:
                raise Exception('no schedule rows')
            for i in items:
                name = i('td')[0].string + ' - ' + i(
                    'td')[2].string + ' - ' + i('td')[1].string + ' - ' + i(
                        'td')[3].string
                addLink(name, 'url', 4, os.path.join(home, 'icon.png'), False)
        except:
            getVideoLinks(page_nns + 'config.xml')
    if 'race_' in url:
        try:
            # header image present: no schedule posted yet, so just show a notification
            image = soup('div', attrs={'id': "nscrHeaderWrapper"})[0].img['src']
            xbmc.executebuiltin("XBMC.Notification(RaceBuddy,No schedule yet - Coverage expected to start in August,15000," + icon + ")")
        except:
            getVideoLinks(page_cup + 'config.xml')
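A minimal sketch (BeautifulSoup 3, Python 2) of the attribute-filtered search that drives the truck branch above; the HTML snippet and column meanings are made up for illustration:

    from BeautifulSoup import BeautifulSoup

    html = '<div class="nscrRow"><div>1:00 PM</div><div>FOX</div><div>Practice</div></div>'
    soup = BeautifulSoup(html)
    for row in soup('div', attrs={'class': "nscrRow"}):   # calling the soup is shorthand for findAll
        cells = row('div')
        print cells[2].string + ' - ' + cells[0].string + ' - ' + cells[1].string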
Example #5
def parse_order_page(text):
    start_pattern = '''<div id="ordList"'''
    end_pattern   = '''<script language='JavaScript'>'''
    start_idx = text.find(start_pattern)
    end_idx   = text[start_idx:].find(end_pattern) + start_idx
    # text.find returns -1 when the end pattern is missing
    if end_idx == start_idx - 1:
        raise Exception('cannot find end pattern from %s: %s' % (order_url, end_pattern))
    text = text[start_idx:end_idx]
    # massage
    text = text.replace('''<a style="cursor:hand";''', '''<a style="cursor:hand"''')
    # parse
    soup = BeautifulSoup(text)
    order_list_table     = soup.table(id="MyOrderListTbl")[0]
    page_navigator_table = soup.table(id="tblNavigator")[0]
    # navigation
    current_page_anchor = page_navigator_table.find('a', href=None)
    next_page_anchor = current_page_anchor.findNextSibling('a')
    # the last page has no next anchor
    next_page_href = next_page_anchor["href"] if next_page_anchor else None
    print 'current page:', current_page_anchor.string
    print 'next page:', next_page_href
    navi_info = (current_page_anchor.string, next_page_href)
    # order list
    orders = []
    remove_bogus_rows = lambda tag: tag.name == u'tr' and len(tag.findAll('td')) != 1
    remove_bogus_cell = lambda tag: tag.name == u'td' and tag['width'] != u'1'
    for tr in order_list_table.find('tr').findNextSiblings(remove_bogus_rows):
        tds = tr.findAll(remove_bogus_cell)
        order_id = tds[0].b.string
        order_detail_link = get_order_detail_link(order_id)
        order_date = tds[1].string
        order_name = tds[2].span.string
        order_price = tds[3].b.string
        pkg_num = tds[3].b.string.next.rsplit('/')[-1]
        deliver_state_link = get_deliver_state_link(order_id)
        if not str(order_date).startswith(target_month):
            continue
        #print '-', order_date, order_id, order_name, order_price, pkg_num, deliver_state_link
        #print '[%s] %s 에 %s원치(%s개)를 샀습니다: %s' % (order_id, order_date, order_price, pkg_num, order_name)
        orders.append( (order_id, order_date, order_price, pkg_num, order_name) )
    return (orders, navi_info)
Example #6
    def order_info(text, target_month):
        ''' parse order list page and return list of Order objects '''

        # parse
        text = OrderListPage.crop(text)
        text = OrderListPage.massage(text)
        soup = BeautifulSoup(text)
    
        # order list
        orders = []
        order_list_table  = soup.table(id="MyOrderListTbl")[0]
        remove_bogus_rows = lambda tag: tag.name == u'tr' and len(tag.findAll('td')) != 1
        for tr in order_list_table.find('tr').findNextSiblings(remove_bogus_rows):
            order = Order.build_from_order_list_page(tr, target_month)
            if order is None:
                continue
            orders.append( order )

        logging.debug("%d orders total." % len(orders))
        return orders
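A minimal sketch of the callable-filter idiom shared by the last two examples: findAll and findNextSiblings accept a function that is called with each candidate Tag and keeps those for which it returns True. The HTML below is made up; the real pages use the MyOrderListTbl layout shown above:

    from BeautifulSoup import BeautifulSoup

    html = ("<table id='MyOrderListTbl'>"
            "<tr><td colspan='5'>header</td></tr>"
            "<tr><td>1</td><td>2</td><td>3</td></tr>"
            "<tr><td>spacer</td></tr>"
            "<tr><td>4</td><td>5</td><td>6</td></tr>"
            "</table>")
    soup = BeautifulSoup(html)
    data_rows = lambda tag: tag.name == u'tr' and len(tag.findAll('td')) != 1
    for tr in soup.table.find('tr').findNextSiblings(data_rows):
        print [td.string for td in tr.findAll('td')]   # skips the single-cell spacer row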