Ejemplo n.º 1
0
    def parse(self, url, html):
        sel = Selector(html)
        link = sel.xpath('//div[@class="menu_block"]/ul/li/a/@href')

        tmp = []

        for u in link:
            if (u.find('www.gohappy.com.tw') != -1):
                tmp.append(u)
            else:
                tmp.append('http://www.gohappy.com.tw' + u)

        return True, tmp
Ejemplo n.º 2
0
    def parse( self, url, html ):
        sel = Selector( html )
        link = sel.xpath('//div[@class="menu_block"]/ul/li/a/@href')

        tmp = []

        for u in link:
            if( u.find('www.gohappy.com.tw') != -1 ):
                tmp.append(u)
            else:
                tmp.append('http://www.gohappy.com.tw' + u)

        return True, tmp
Ejemplo n.º 3
0
    def parse( self, url, html ):

        # This site has a Sitemap so parser will use Sitemap
        # The Url http://www.gohappy.com.tw/ is a standard start key url

        if( url == 'http://www.gohappy.com.tw/' ):

            # This Page will redriect to sitemap
            if( self.test_flag ):
                print 'Hihi'
                print self.config['name']
                return False, []

            return True, ['http://www.gohappy.com.tw/intro/sitemap.html']

        elif( url == 'http://www.gohappy.com.tw/intro/sitemap.html' ):

            #Parse Sitemap
            sel = Selector( html )
            block = sel.xpath('//div[@class="sitemap_group"]')

            #remove some Block

            tmp = [] # Create a tmp Url List

            for __block in block:
                #print __block.xpath('dl/dt/a/@title')[0]
                level1 = __block.xpath('dl/dd/ul/li/a/@href')
                level2 = __block.xpath('dl/dd/div/ul/li/a/@href')

                for u in level1:
                    if ( u != '' ):
                        if( u.find('www.gohappy.com.tw') != -1 ):
                            tmp.append(u)
                        else:
                            tmp.append( 'http://www.gohappy.com.tw' + u )

                for u in level2:
                    if ( u != '' ):
                        if( u.find('www.gohappy.com.tw') != -1 ):
                            tmp.append(u)
                        else:
                            tmp.append( 'http://www.gohappy.com.tw' + u )

            return True, tmp
            
        else:
            return False, []
Ejemplo n.º 4
0
    def parse(self, url, html):
        sel = Selector(html)

        tmp = []

        # If Catalog 是 點數專區則直接拋棄
        param = urlhelper.urlParamParse(url)
        if (param['sid'] == '14'):
            return True, []

        #Key Word : 更多商品 請由左側目錄點選
        if (html.find('更多商品 請由左側目錄點選') != -1):

            #取的左側連接
            link = sel.xpath('//*[@id="block_left"]/div[1]/ul/li/a/@href')
            for i in link:
                if (i.find('www.gohappy.com.tw') != -1):
                    tmp.append(i)
                else:
                    tmp.append('http://www.gohappy.com.tw' + i)

        else:

            # Get Product Item
            product_list = sel.xpath('//ul[@class="product_list"]/li')

            #crumb
            path = sel.xpath('//ul[@id="path"]/li')

            print len(path)

            crumb = []

            for i in xrange(1, len(path)):

                href = path[i].xpath('span/a/@href')
                if (href == []):
                    href = path[i].xpath('a/@href')[0]
                else:
                    href = path[i].xpath('span/a/@href')[0]

                title = path[i].xpath('span/a/text()')
                if (title == []):
                    title = path[i].xpath('a/text()')[0]
                else:
                    title = path[i].xpath('span/a/text()')[0]

                if (href != '#'):
                    id_list = urlhelper.urlParamParse(href)

                    cid = -1

                    if ('cid' in id_list):
                        cid = id_list['cid']
                    else:
                        cid = id_list['sid']

                    crumb.append({'id': cid, 'name': title})

            data = dict()
            data['website_code'] = 'gohappy'
            data['1st_price_name'] = '特惠價'
            data['timestamp'] = datetime.datetime.utcnow()
            data['catalog'] = crumb

            for item in product_list:
                data['pic_url'] = item.xpath('p[1]/a/img/@src')[0].encode(
                    'utf-8')
                data['goods_name'] = item.xpath('p[1]/a/img/@title')[0]
                data['1st_price'] = item.xpath(
                    'p//span[@class="price"]/text()')[0]
                data['goods_url'] = 'http://www.gohappy.com.tw' + item.xpath(
                    'p[1]/a/@href')[0]
                data['goods_original_id'] = urlhelper.urlParamParse(
                    data['goods_url'])['pid']

                print "Download %s" % data['goods_original_id']
                try:
                    urllib.urlretrieve(
                        data['pic_url'].replace('_2', '_4'),
                        '/root/crawler/gohappy/img/%s.jpg' %
                        data['goods_original_id'])
                except:
                    try:
                        urllib.urlretrieve(
                            data['pic_url'].replace('_2', '_3_1'),
                            '/root/crawler/gohappy/img/%s.jpg' %
                            data['goods_original_id'])
                    except:
                        try:
                            urllib.urlretrieve(
                                data['pic_url'],
                                '/root/crawler/gohappy/img/%s.jpg' %
                                data['goods_original_id'])
                        except:
                            print 'Reset by Peer'

                data['local_pic'] = 'img/%s.jpg' % data['goods_original_id']

                # Write back to MongoDB
                data.pop('_id', None)
                self.db.gohappy.goods.insert(data)

            # Check Next Page
            current_page = 1
            param = urlhelper.urlParamParse(url)
            if ('cp' in param):
                current_page = int(param['cp'])
            else:
                url += '&cp=1'

            #Get Last Page Url
            next_page = sel.xpath('//p[@class="page_number"]/a/@href')
            if (len(next_page) != 0):
                next_page = urlhelper.urlParamParse(next_page[len(next_page) -
                                                              1])['cp']
                next_page = int(next_page)
            else:
                next_page = 1

            if (next_page > current_page):
                print 'Next Page'
                tmp.append(
                    url.replace('cp=' + str(current_page),
                                'cp=' + str(next_page)))

        time.sleep(1)

        return True, tmp