Python Selector.xpath Beispiele

Programmiersprache: Python

Namespace / Paketname: selector

Klasse / Typ: Selector

Methode / Funktion: xpath

Beispiele auf hotexamples.com: 4

Python Selector.xpath - 4 Beispiele gefunden. Dies sind die am besten bewerteten Python-Beispiele für die Methode selector.Selector.xpath, die aus Open-Source-Projekten extrahiert wurden. Sie können Beispiele bewerten, um die Qualität der Beispiele zu verbessern.

Häufig verwendete Methoden

Anzeigen Verbergen

Selector(30)

select(4)

xpath(3)

setStimuliLock(3)

pop(3)

default(2)

stop(2)

push(2)

result(2)

extend(2)

add(2)

execute(2)

takeStimuliActivateTile(1)

select_edges_for_improvement(1)

write(1)

process(1)

term_path(1)

remove(1)

term_meaning_path(1)

run_forever(1)

scrape(1)

validate(1)

select_on(1)

status405(1)

selector(1)

selectorLogic(1)

term_detail_path(1)

term_path_jp(1)

takeStimuliStandBy(1)

place(1)

set_relaxation(1)

shuffle(1)

start(1)

status404(1)

update(1)

Run(1)

match(1)

click(1)

__init__(1)

_parent(1)

add_road(1)

apex_seasons(1)

apply(1)

are_crossings_hidden(1)

are_kerb_lines_hidden(1)

are_polylines_hidden(1)

build_pipeline(1)

cancelAllSelection(1)

catalog_container(1)

Beispiel #1

Datei anzeigen

Datei: parser_vs.py Projekt: livekn/hexathel

    def parse(self, url, html):
        """Collect the menu links of a page as absolute gohappy URLs.

        Every ``href`` found under ``div.menu_block`` in ``html`` is read.
        Links that already contain the site host are kept unchanged; all
        others get ``http://www.gohappy.com.tw`` prepended.  ``url`` is
        accepted but not used here.

        Returns:
            A ``(True, links)`` tuple, ``links`` being the list of URLs.
        """
        menu_hrefs = Selector(html).xpath(
            '//div[@class="menu_block"]/ul/li/a/@href')

        links = [
            href if 'www.gohappy.com.tw' in href
            else 'http://www.gohappy.com.tw' + href
            for href in menu_hrefs
        ]
        return True, links

Beispiel #2

Datei anzeigen

Datei: parser_vs.py Projekt: imdedr/hexathel

    def parse(self, url, html):
        """Return the links of the page's menu block, made absolute.

        The hrefs are taken from ``//div[@class="menu_block"]/ul/li/a``.
        An href that does not mention ``www.gohappy.com.tw`` is prefixed
        with the site root; the others pass through untouched.  The ``url``
        argument is not consulted.

        Returns:
            ``(True, links)`` where ``links`` is the resulting list.
        """
        site_root = 'http://www.gohappy.com.tw'
        document = Selector(html)

        absolute = []
        for href in document.xpath('//div[@class="menu_block"]/ul/li/a/@href'):
            has_host = 'www.gohappy.com.tw' in href
            absolute.append(href if has_host else site_root + href)

        return True, absolute

Beispiel #3

Datei anzeigen

    def parse(self, url, html):
        """Send the crawler from the start page to the sitemap and expand it.

        * Start page (``http://www.gohappy.com.tw/``): the sitemap URL is
          returned as the only follow-up link.  When ``self.test_flag`` is
          truthy the method instead prints a marker and
          ``self.config['name']`` and returns ``(False, [])``.
        * Sitemap page: the hrefs of both link levels of every
          ``div.sitemap_group`` block are collected, empty hrefs are
          skipped, and hrefs lacking the site host get it prepended.
        * Any other URL: ``(False, [])``.

        Returns:
            A ``(ok, links)`` tuple as described above.
        """
        start_url = 'http://www.gohappy.com.tw/'
        sitemap_url = 'http://www.gohappy.com.tw/intro/sitemap.html'

        if url == start_url:
            # The start page only redirects to the sitemap.
            if self.test_flag:
                print('Hihi')
                print(self.config['name'])
                return False, []
            return True, [sitemap_url]

        if url != sitemap_url:
            return False, []

        # Sitemap page: walk every group and gather both link levels.
        links = []
        for group in Selector(html).xpath('//div[@class="sitemap_group"]'):
            first_level = group.xpath('dl/dd/ul/li/a/@href')
            second_level = group.xpath('dl/dd/div/ul/li/a/@href')

            for hrefs in (first_level, second_level):
                for href in hrefs:
                    if href == '':
                        continue
                    if 'www.gohappy.com.tw' in href:
                        links.append(href)
                    else:
                        links.append('http://www.gohappy.com.tw' + href)

        return True, links

Beispiel #4

Datei anzeigen

Datei: parser_vc.py Projekt: livekn/hexathel

    def parse(self, url, html):
        """Parse a gohappy catalog page: store its products, return next links.

        Behaviour by page type:

        * ``sid == '14'`` in ``url``: nothing is parsed, ``(True, [])`` is
          returned immediately (the original note calls this the
          reward-points section).
        * ``html`` contains the "more products, use the left-hand menu"
          marker text: the hrefs of the left-hand block are returned as
          absolute URLs.
        * Otherwise the page is handled as a product listing: the breadcrumb
          is read, every product is inserted into ``self.db.gohappy.goods``
          after an attempt to download its picture, and the URL of a further
          result page is appended when the pager shows a higher page number.

        Except for the early ``sid == '14'`` return, the method sleeps for
        one second before returning.

        Args:
            url: URL the page came from; its query parameters drive the
                catalog check and the paging.
            html: Page source.

        Returns:
            ``(True, links)`` where ``links`` lists the URLs to visit next.

        Raises:
            KeyError: presumably when ``url`` has no ``sid`` parameter or a
                breadcrumb link has neither ``cid`` nor ``sid`` (assumes
                ``urlhelper.urlParamParse`` returns a dict - TODO confirm).
            IndexError: when an expected node (picture, price, link) is
                missing from a product or breadcrumb entry.
        """
        sel = Selector(html)

        tmp = []

        # If the catalog is the reward-points section, drop it outright.
        param = urlhelper.urlParamParse(url)
        if param['sid'] == '14':
            return True, []

        # Keyword check: pages carrying this text only point at the menu on
        # the left ("more products, please choose from the left-hand menu").
        if '更多商品 請由左側目錄點選' in html:

            # Collect the left-hand links.
            for href in sel.xpath('//*[@id="block_left"]/div[1]/ul/li/a/@href'):
                if 'www.gohappy.com.tw' in href:
                    tmp.append(href)
                else:
                    tmp.append('http://www.gohappy.com.tw' + href)

        else:

            # Product items of this listing page.
            product_list = sel.xpath('//ul[@class="product_list"]/li')

            # Breadcrumb trail.
            path = sel.xpath('//ul[@id="path"]/li')

            print(len(path))

            crumb = []

            # The first breadcrumb entry is skipped.
            for i in xrange(1, len(path)):
                node = path[i]

                # Prefer the <span>-wrapped anchor and fall back to a bare
                # one; each XPath is evaluated only once.
                hrefs = node.xpath('span/a/@href')
                href = hrefs[0] if hrefs else node.xpath('a/@href')[0]

                titles = node.xpath('span/a/text()')
                title = titles[0] if titles else node.xpath('a/text()')[0]

                if href != '#':
                    id_list = urlhelper.urlParamParse(href)

                    # A breadcrumb is identified by cid when present,
                    # otherwise by sid.
                    if 'cid' in id_list:
                        cid = id_list['cid']
                    else:
                        cid = id_list['sid']

                    crumb.append({'id': cid, 'name': title})

            # One dict is reused for all products; the per-item fields are
            # overwritten on every iteration.
            data = dict()
            data['website_code'] = 'gohappy'
            data['1st_price_name'] = '特惠價'
            data['timestamp'] = datetime.datetime.utcnow()
            data['catalog'] = crumb

            for item in product_list:
                data['pic_url'] = item.xpath('p[1]/a/img/@src')[0].encode(
                    'utf-8')
                data['goods_name'] = item.xpath('p[1]/a/img/@title')[0]
                data['1st_price'] = item.xpath(
                    'p//span[@class="price"]/text()')[0]
                data['goods_url'] = 'http://www.gohappy.com.tw' + item.xpath(
                    'p[1]/a/@href')[0]
                data['goods_original_id'] = urlhelper.urlParamParse(
                    data['goods_url'])['pid']

                goods_id = data['goods_original_id']
                print("Download %s" % goods_id)

                # Try the picture variants in order of preference and stop
                # at the first one that downloads.  Catching Exception
                # (instead of a bare except) keeps the download best-effort
                # while letting KeyboardInterrupt/SystemExit stop the crawl.
                target = '/root/crawler/gohappy/img/%s.jpg' % goods_id
                pic_url = data['pic_url']
                candidates = (
                    pic_url.replace('_2', '_4'),
                    pic_url.replace('_2', '_3_1'),
                    pic_url,
                )
                for candidate in candidates:
                    try:
                        urllib.urlretrieve(candidate, target)
                    except Exception:
                        continue
                    break
                else:
                    # Every variant failed.
                    print('Reset by Peer')

                data['local_pic'] = 'img/%s.jpg' % goods_id

                # Write back to MongoDB.  Any '_id' left in the reused dict
                # is dropped first (presumably insert() adds one - confirm
                # against the driver in use).
                data.pop('_id', None)
                self.db.gohappy.goods.insert(data)

            # Check for a further result page.
            current_page = 1
            param = urlhelper.urlParamParse(url)
            if 'cp' in param:
                current_page = int(param['cp'])
            else:
                # No page parameter yet: add one so the replace below has
                # something to rewrite.
                url += '&cp=1'

            # The last pager link supplies the target page number.
            pager_links = sel.xpath('//p[@class="page_number"]/a/@href')
            if pager_links:
                next_page = int(urlhelper.urlParamParse(pager_links[-1])['cp'])
            else:
                next_page = 1

            if next_page > current_page:
                print('Next Page')
                tmp.append(
                    url.replace('cp=' + str(current_page),
                                'cp=' + str(next_page)))

        time.sleep(1)

        return True, tmp