Esempio n. 1
0
def _digits_to_int(text):
    """Return the integer formed by the ASCII digits found in *text*.

    Every character other than 0-9 (spaces, separators, currency signs, ...)
    is dropped before conversion. Raises ValueError when no digit is left.
    """
    return int(re.sub(r'[^0-9]', '', text))


def parse_exchange_page(page):
    """Parse one exchange listing page and store the clubs not seen before.

    Each ``div`` whose class contains ``row main-row-w`` is handled as one
    listing row. The club id is read from the href of the first link inside
    the row's first child. When ``Public.get`` finds no record for that id,
    the name, price, size and coverage are scraped from the row and a new
    ``Public`` is created.

    NOTE(review): ``db_session`` / ``Public`` are defined outside this block;
    they look like a Pony ORM session and entity -- confirm.

    Args:
        page: HTML source of the page, passed to
            ``lxml.html.document_fromstring``.

    Returns:
        A ``(last_size, were_new)`` tuple: ``last_size`` is the ``size`` of
        the ``Public`` belonging to the last row processed (0 when the page
        has no rows) and ``were_new`` tells whether at least one ``Public``
        was created.

    Raises:
        ValueError: the club link has no numeric club id, a numeric cell
            contains no digits, or a row does not hold exactly two
            ``span.num`` elements.
        Exception: anything raised while creating a ``Public`` is logged
            together with the scraped values and re-raised.
    """
    logging.debug('Parsing exchange page')

    data = lxml.html.document_fromstring(page)
    rows = data.xpath('//div[contains(@class, "row main-row-w")]')

    last_size = 0
    were_new = False
    with db_session:
        for row in rows:
            club = row.getchildren()[0].xpath('.//a')[0]
            href = club.attrib['href']

            # Require at least one digit: the former pattern 'club(\d+)*'
            # matched a bare "club" and yielded club_id=None.
            match = re.search(r'club(\d+)', href)
            if match is None:
                raise ValueError(
                    'Cannot extract club id from href: {!r}'.format(href))
            club_id = match.group(1)

            public = Public.get(club_id=club_id)
            if public is None:
                name = club.text_content().strip() or 'Noname'
                price_span = row.xpath(
                    './/span[contains(@class, "js_placement_price")]')[0]
                price = _digits_to_int(price_span.text_content())
                # The row is expected to hold exactly two "num" spans:
                # size first, coverage second.
                size, coverage = [
                    _digits_to_int(span.text_content())
                    for span in row.xpath('.//span[@class="num"]')
                ]

                try:
                    public = Public(club_id=club_id, name=name,
                                    size=size, coverage=coverage,
                                    price=price)
                except Exception:
                    logging.error(
                        'club_id: %s, name: %s, size: %s, price: %s',
                        club_id, name, size, price)
                    # Bare raise keeps the original traceback intact.
                    raise

                were_new = True

            last_size = public.size

    return last_size, were_new
Esempio n. 2
0
def parse_exchange_page(page):
    """Parse one exchange listing page and store the publics not seen before.

    Each ``a`` element with class ``exchange_ad_post_stats`` marks one
    listing. The club id is read from its ``onclick`` attribute; the other
    fields are reached by walking sibling/parent elements starting from that
    link, so the code depends on the exact page layout. When ``Public.get``
    finds no record for the club id, a new ``Public`` is created from the
    scraped values.

    NOTE(review): ``db_session`` / ``Public`` are defined outside this block;
    they look like a Pony ORM session and entity -- confirm.

    Args:
        page: HTML source of the page, passed to
            ``lxml.html.document_fromstring``.

    Returns:
        A ``(last_size, were_new)`` tuple: ``last_size`` is the ``size`` of
        the ``Public`` belonging to the last listing processed (0 when the
        page has no listings) and ``were_new`` tells whether at least one
        ``Public`` was created.

    Raises:
        ValueError: the ``onclick`` attribute has no numeric club id.
        Exception: anything raised while creating a ``Public`` is logged
            together with the scraped values and re-raised.
    """
    logging.debug('Parsing exchange page')

    data = lxml.html.document_fromstring(page)
    public_names = data.xpath('//a[@class="exchange_ad_post_stats"]')

    def text2int(text):
        """Convert text such as '12 345' to int; return 0 if it is not one."""
        try:
            return int(text.replace(' ', ''))
        except (AttributeError, TypeError, ValueError):
            # Best effort: missing or non-numeric text counts as 0. Only
            # conversion errors are caught, so KeyboardInterrupt and
            # SystemExit still propagate.
            return 0

    last_size = 0
    were_new = False
    with db_session:
        for public_name in public_names:
            onclick = public_name.attrib['onclick']

            # Require at least one digit: the former pattern 'stats-(\d+)*'
            # matched a bare "stats-" and yielded club_id=None.
            match = re.search(r'stats-(\d+)', onclick)
            if match is None:
                raise ValueError(
                    'Cannot extract club id from onclick: {0!r}'.format(onclick))
            club_id = match.group(1)

            public = Public.get(club_id=club_id)
            if public is None:
                # Walk the layout: link with href/name -> category element
                # -> size cell -> coverage cell -> price cell.
                cur_path = public_name.getparent().getnext()
                public_id = cur_path.attrib['href'].lstrip('/')
                name = cur_path.text if cur_path.text else 'Noname'
                cur_path = cur_path.getnext().getnext()
                category = cur_path.text
                cur_path = cur_path.getparent().getnext()
                size = text2int(cur_path.xpath('b')[0].text_content())
                cur_path = cur_path.getnext()
                coverage2 = cur_path.xpath('b')[0].text_content()
                # The cell reads "<coverage>/<coverage_day>". partition()
                # tolerates a missing or extra '/', in which case the
                # affected part falls back to 0 like every other number here.
                coverage_text, _, coverage_day_text = coverage2.partition('/')
                coverage = text2int(coverage_text)
                coverage_day = text2int(coverage_day_text)
                cur_path = cur_path.getnext()
                price = text2int(cur_path.xpath('b')[0].text_content())

                try:
                    public = Public(club_id=club_id, public_id=public_id,
                                    name=name, category=category, size=size,
                                    coverage=coverage,
                                    coverage_day=coverage_day, price=price)
                except Exception:
                    logging.error(
                        'public_id: %s, name: %s, size: %s, price: %s',
                        public_id, name, size, price)
                    # Bare raise keeps the original traceback intact.
                    raise

                were_new = True

            last_size = public.size

    return last_size, were_new