Example #1
    def parse_one_top(self, response):

        logger.info('fetch : ' + response.url)

        img = f_xpath(response,
                      '//div[contains(@class, "inner-main-content")]')

        meta = {}
        meta['name'] = fx_extract(img, './div/h3/text()').strip().strip('#')
        meta['img'] = fx_extract(img, './/div[@class="inner-image"]/img/@src')
        meta['key'] = meta['img'][self.prefix_len:]
        meta['from'] = fx_extract(img, './/div[@class="inner-image"]/a/@href')
        meta['desc'] = fx_extract(img, './div/p/text()')

        curr_meta = response.meta
        curr_meta['top'].append(meta)

        nexturl = self.next_top_img(response)
        if nexturl:
            yield scrapy.http.Request(url=nexturl,
                                      callback=self.parse_one_top,
                                      meta=curr_meta,
                                      dont_filter=True)
        else:
            cli = authedclient()
            cli.upload_data(curr_meta['key'],
                            json.dumps(curr_meta),
                            headers={'Content-Type': 'text/json'})
            logger.info('upload : ' + curr_meta['key'])

            for meta in curr_meta['top']:
                put_file_from_url(cli, meta['key'], meta['img'])
                logger.info('upload : %s from %s' % (meta['key'], meta['img']))
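All of the examples here call two helpers, f_xpath and fx_extract, whose definitions are not included. Below is a minimal sketch of what they are assumed to do, inferred only from the call sites: thin guards over Scrapy's selector API that return None instead of raising when a node or value is missing. The names and signatures are assumptions, not code from the original project.

    # Assumed helpers (not part of the original examples): f_xpath returns the
    # first node matched by an XPath expression, fx_extract returns the first
    # extracted string; both return None when nothing matches.
    def f_xpath(selector, path):
        nodes = selector.xpath(path)
        return nodes[0] if nodes else None

    def fx_extract(selector, path):
        if selector is None:
            return None
        values = selector.xpath(path).extract()
        return values[0] if values else None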
Example #2
    def parse_one_top( self, response ):

        logger.info( 'fetch : ' + response.url )

        img = f_xpath( response, '//div[contains(@class, "inner-main-content")]' )

        meta = {}
        meta[ 'name' ] = fx_extract( img, './div/h3/text()' ).strip().strip('#')
        meta[ 'img'  ] = fx_extract( img, './/div[@class="inner-image"]/img/@src' )
        meta[ 'key'  ] = meta[ 'img' ][ self.prefix_len: ]
        meta[ 'from' ] = fx_extract( img, './/div[@class="inner-image"]/a/@href' )
        meta[ 'desc' ] = fx_extract( img, './div/p/text()' )

        curr_meta = response.meta
        curr_meta[ 'top' ].append( meta )

        nexturl = self.next_top_img( response )
        if nexturl:
            yield scrapy.http.Request( url = nexturl,
                                       callback = self.parse_one_top,
                                       meta = curr_meta,
                                       dont_filter = True )
        else:
            cli = authedclient()
            cli.upload_data( curr_meta[ 'key' ], json.dumps( curr_meta ),
                        headers = { 'Content-Type' : 'text/json' } )
            logger.info( 'upload : ' + curr_meta[ 'key' ] )

            for meta in curr_meta[ 'top' ]:
                put_file_from_url( cli, meta[ 'key' ], meta[ 'img' ] )
                logger.info( 'upload : %s from %s' % ( meta[ 'key' ], meta[ 'img' ] ) )
Example #3
    def _parse_callout(self, p):
        callout = f_xpath( p, './div[@class="callout"]' )

        pct_off = ''
        if callout:
            pct = fx_extract( callout, './span[@class="pct"]/text()' ) or ''
            off = fx_extract( callout, './span[@class="off"]/text()' ) or ''
            pct_off = (pct + ' ' + off).strip()

        return pct_off
Example #4
    def next_page( self, response ):

        nexturl = f_xpath( response, '//table[@class="CMheadingBar"]/tr/td[1]/div/span' )
        if nexturl is None:
            return None

        return fx_extract( nexturl, './a[contains(text(), "Next")]/@href' )
Example #5
    def _parse_price(self, p):
        price = f_xpath( p, './div[@class="product-price"]' )
        p_c = fx_extract( price, './/span[@itemprop="priceCurrency"]/@content' ) or ''
        p_p = fx_extract( price, './/span[@class="price"]/@content' ) or '-1'
        p_r = fx_extract( price, './/span[@class="regular-price"]/text()' ) or ''
        p_u = fx_extract( price, './/span[@itemprop="priceValidUntil"]/@content' ) or ''
        try:
            float( p_p )
        except Exception:
            p_p = '-1'

        if p_c == 'USD':
            if '$' in p_r:
                p_r = p_r.split('$')[1].split()[0]

        return p_c, p_p, p_r, p_u
Example #6
    def _parse_date(self, response):
        date = f_xpath( response, '//div[contains(@class, "offer-pdp-hd") ' + \
                                   ' and contains(@class, "clearfix")]' )
        starts = ''
        expiration = ''
        if not date:
            return starts, expiration

        st = f_xpath( date, './/div[@class="offer-right"]' )
        if st:
            starts = fx_extract( st, './h2/text()' ) or ''

        exp = f_xpath( date, './/div[contains(@class, "expiration") ' + \
                               ' and contains(@class, "with-date") ]' )
        if exp:
            expiration = fx_extract( exp, './div/text()' ) or ''
        return starts, expiration
Example #7
    def next_page(self, response):

        nexturl = f_xpath(response,
                          '//table[@class="CMheadingBar"]/tr/td[1]/div/span')
        if nexturl is None:
            return None

        return fx_extract(nexturl, './a[contains(text(), "Next")]/@href')
Example #8
    def next_top_img(self, response):
        urls = f_xpath(
            response,
            '//div[contains(@style, "margin-bottom")]').xpath('./div')[1:]
        for url in urls:
            nxt = fx_extract(url, './a[contains(text(), "Next")]/@href')
            if nxt:
                return nxt
Example #9
    def parse_num_help_review( self, div ):

        # text : 3 of 3 people found the following review helpful

        text = fx_extract( div, './text()' )
        if text is None:
            return None

        text = text.strip().split()
        return ( int( text[ 0 ] ), int( text[ 2 ] ) )
Example #10
    def store_next_page( self, response ):
        nexturl = f_xpath( response, '//div[@class="pagination"]/span[@class="next"]' )
        if nexturl is None:
            return None

        uri = fx_extract( nexturl, './a[contains(text(), "Next")]/@href' )
        if not uri:
            return None

        return self.rooturl + uri
Example #11
    def parse_num_help_review(self, div):

        # text : 3 of 3 people found the following review helpful

        text = fx_extract(div, './text()')
        if text is None:
            return None

        text = text.strip().split()
        return (int(text[0]), int(text[2]))
Example #12
    def _parse_star(self, div):

        # text : 5.0 out of 5 stars

        star = fx_extract(div, './span[1]/span/span/text()')
        if star is None:
            return None
        else:
            star = star.strip().split()
            star = (float(star[0]), float(star[3]))

            return star
Example #13
    def _parse_star( self, div ):

        # text : 5.0 out of 5 stars

        star = fx_extract( div, './span[1]/span/span/text()' )
        if star is None:
            return None
        else:
            star = star.strip().split()
            star = ( float( star[ 0 ] ), float( star[ 3 ] ) )

            return star
Example #14
    def parse(self, response):
        #with open( '/tmp/findnsave_sales.html', 'w' ) as f:
        #    f.write( response.body )

        logger.info( 'fetch : ' + response.url )
        sales = f_xpath( response, '//ul[contains(@class, "listing") ' + \
                                   ' and contains(@class, "retailer-detail")' + \
                                   ' and contains(@class, "infinite")]' ).xpath(
                                './li[starts-with(@id, "offer-")]' )
        for s in sales:
            s = f_xpath( s, './div' ).xpath( './a' )
            id = fx_extract( s, './@data-offer-id' )
            href = fx_extract( s, './@href' )
            th_img = fx_extract( s, './img/@src' )

            if not ( id and href and th_img ):
                continue

            # TODO : id if in db continue

            if not href.startswith( 'http://' ):
                href = self.rooturl + href

            meta = { 'id' : id,
                     'href' : href,
                     'th_img' : th_img }

            yield scrapy.http.Request( url = href,
                                       callback = self.parse_one_sale,
                                       meta = meta,
                                       dont_filter = True )


        next_url = self.store_next_page( response )
        if next_url is None:
            return

        yield scrapy.http.Request( url = next_url, callback = self.parse,
                                   dont_filter = True )
Example #15
    def parse(self, response):

        logger.info( 'fetch : ' + response.url )
        states = f_xpath( response, '//select[@id="states-dropdown"]' ).xpath( './option' )

        sts = {}
        for st in states:
            st_short = fx_extract( st, './@value' )
            st_name = fx_extract( st, './text()' )

            if not st_short:
                continue

            if st_short not in sts:
                sts[ st_short ] = st_name

        states = xpath( response, '//ul[contains(@class, "hide") ' + \
                                  ' and contains(@class, "clearfix")]' )

        #state_fd = open( '/tmp/state_url.csv', 'w' )
        #csvw = csv.writer( state_fd )
        for st in states:
            st_short = fx_extract( st, './@id' )
            locs = st.xpath( './li' )
            for loc in locs:
                url = fx_extract( loc, './a/@href' )
                area = fx_extract( loc, './a/text()' )
                #csvw.writerow( [ st_short, sts.get( st_short, '' ), area, url ] )

                if st_short not in sts:
                    continue

                d = FindnsaveAreaItem()
                d[ 'area'  ] = area
                d[ 'short' ] = st_short
                d[ 'state' ] = sts[ st_short ]
                d[ 'url'   ] = url

                yield d
Example #16
    def parse(self, response):

        logger.info('fetch : ' + response.url)
        catgos = f_xpath( response, '//ul[contains(@class, "listing") ' + \
                                    ' and contains(@class, "grouping")' + \
                                    ' and contains(@class, "infinite")]' ).xpath( './li' )

        for ctg in catgos:
            ctg = f_xpath(ctg, './/div[@class="chiclet-actions"]/a')
            if not ctg:
                continue

            href = fx_extract(ctg, './@href')
            name = fx_extract(ctg, './@title')
            name = self.parse_categorie_name(name)

            try:
                _c, cid, id = href.strip('/').split('/')
            except (AttributeError, ValueError):
                continue

            #csv.writer( self.csv_fd ).writerow( [ id, cid, name, href ] )

            d = FindnsaveCategoryItem()
            d['id'] = id
            d['name'] = name
            d['nameid'] = cid
            d['uri'] = href

            yield d

        next_url = self.categorie_next_page(response)
        if next_url is None:
            return

        yield scrapy.http.Request(url=next_url,
                                  callback=self.parse,
                                  dont_filter=True)
Example #17
    def parse(self, response):

        logger.info( 'fetch : ' + response.url )
        tops = f_xpath( response, '//ul[contains(@class, "thumbnails")]' ).xpath( './li' )

        for top in tops:
            top = f_xpath( top, './div[@class="thumbnail"]' )
            if not top:
                continue

            name = fx_extract( top, './p/strong/text()' )
            href = fx_extract( top, './a/@href' ).strip()

            curr_meta = {}
            curr_meta[ 'name' ] = name
            curr_meta[ 'url' ] = href
            curr_meta[ 'key' ] = 'meta/' + href[ self.prefix_len: ] + '.json'
            curr_meta[ 'top' ] = []

            yield scrapy.http.Request( url = href,
                                       callback = self.parse_one_top,
                                       meta = curr_meta,
                                       dont_filter = True )
Example #18
    def parse(self, response):

        logger.info('fetch : ' + response.url)
        brands = f_xpath( response, '//ul[contains(@class, "brands") ' + \
                                    ' and contains(@class, "columnize")' + \
                                    ' and contains(@class, "clearfix")]' ).xpath( './li' )

        for br in brands:
            br = f_xpath(br, './a')
            if not br:
                continue

            href = fx_extract(br, './@href')
            name = fx_extract(br, './text()')

            try:
                _b, bid, id = href.strip('/').split('/')
            except (AttributeError, ValueError):
                continue

            #csv.writer( self.csv_fd ).writerow( [ id, bid, name, href ] )

            d = FindnsaveBrandItem()
            d['id'] = id
            d['name'] = escape(name)
            d['nameid'] = bid
            d['uri'] = href

            yield d

        next_url = self.brand_next_page(response)
        if next_url is None:
            return

        yield scrapy.http.Request(url=next_url,
                                  callback=self.parse,
                                  dont_filter=True)
Example #19
    def parse(self, response):

        logger.info( 'fetch : ' + response.url )
        catgos = f_xpath( response, '//ul[contains(@class, "listing") ' + \
                                    ' and contains(@class, "grouping")' + \
                                    ' and contains(@class, "infinite")]' ).xpath( './li' )

        for ctg in catgos:
            ctg = f_xpath( ctg, './/div[@class="chiclet-actions"]/a' )
            if not ctg:
                continue

            href = fx_extract( ctg, './@href' )
            name = fx_extract( ctg, './@title' )
            name = self.parse_categorie_name( name )

            try:
                _c, cid, id = href.strip( '/' ).split( '/' )
            except ( AttributeError, ValueError ):
                continue

            #csv.writer( self.csv_fd ).writerow( [ id, cid, name, href ] )

            d = FindnsaveCategoryItem()
            d['id'] = id
            d['name'] = name
            d['nameid'] = cid
            d['uri'] = href

            yield d

        next_url = self.categorie_next_page( response )
        if next_url is None:
            return

        yield scrapy.http.Request( url = next_url, callback = self.parse,
                                   dont_filter = True )
Example #20
    def parse_one_sale(self, response):
        #with open( '/tmp/findnsave_sales_one.html', 'w' ) as f:
        #    f.write( response.body )

        sale = f_xpath( response, '//div[contains(@class, "offer-description-wrapper") ' + \
                                   ' and contains(@class, "clearfix")]' )
        if not sale:
            return

        starts, expiration = self._parse_date( response )
        pct_off = self._parse_callout( sale )
        lg_img = self._parse_large_img( sale )

        sr = f_xpath( sale, './div[@class="offer-right"]' )
        name = fx_extract( sr, './h1[@itemprop="name"]/text()' )
        if name is None:
            logger.debug( 'not crawl name in : ' + response.url )
            return

        p_c, p_p, p_r, p_u = self._parse_price( sr )
        desc = self._parse_desc( sr )
        retailer, category, brand = self._parse_retailer_category_brand( sr )

        data = [ response.meta[ 'id' ], name,
                    p_c, p_p, p_r, p_u, pct_off,
                    starts, expiration,
                    retailer, category, brand,
                    response.url, response.meta[ 'th_img' ], lg_img,
                    desc, ]

        d = FindnsaveSaleItem()
        d['area'] = 'newyork'
        d['id'] = response.meta[ 'id' ]
        d['name'] = escape(name)
        d['priceCurrency'] = p_c
        d['price'] = p_p
        d['priceRegular'] = p_r
        d['priceUtilDate'] = p_u
        d['priceOff'] = pct_off
        d['retailer'] = escape(retailer)
        d['category'] = escape(category)
        d['brand'] = escape(brand)
        d['desc'] = escape(desc)

        yield d

        #self.jsonfile.write( json.dumps( data ) + '\n' )
        logger.info( 'crawl : `' + name + '` OK' )
        return
Example #21
    def parse(self, response):

        logger.info('fetch : ' + response.url)
        tops = f_xpath(response,
                       '//ul[contains(@class, "thumbnails")]').xpath('./li')

        for top in tops:
            top = f_xpath(top, './div[@class="thumbnail"]')
            if not top:
                continue

            name = fx_extract(top, './p/strong/text()')
            href = fx_extract(top, './a/@href').strip()

            curr_meta = {}
            curr_meta['name'] = name
            curr_meta['url'] = href
            curr_meta['key'] = 'meta/' + href[self.prefix_len:] + '.json'
            curr_meta['top'] = []

            yield scrapy.http.Request(url=href,
                                      callback=self.parse_one_top,
                                      meta=curr_meta,
                                      dont_filter=True)
Example #22
    def parse(self, response):

        logger.info( 'fetch : ' + response.url )
        brands = f_xpath( response, '//ul[contains(@class, "brands") ' + \
                                    ' and contains(@class, "columnize")' + \
                                    ' and contains(@class, "clearfix")]' ).xpath( './li' )

        for br in brands:
            br = f_xpath( br, './a' )
            if not br:
                continue

            href = fx_extract( br, './@href' )
            name = fx_extract( br, './text()' )

            try:
                _b, bid, id = href.strip( '/' ).split( '/' )
            except ( AttributeError, ValueError ):
                continue

            #csv.writer( self.csv_fd ).writerow( [ id, bid, name, href ] )

            d = FindnsaveBrandItem()
            d['id'] = id
            d['name'] = escape(name)
            d['nameid'] = bid
            d['uri'] = href

            yield d

        next_url = self.brand_next_page( response )
        if next_url is None:
            return

        yield scrapy.http.Request( url = next_url, callback = self.parse,
                                   dont_filter = True )
Example #23
    def parse_from(self, div):
        return fx_extract(div, './b/text()') or ''
Example #24
    def parse_reviewer_from( self, div ):
        return ( fx_extract( div, './div/div[2]/a/span/text()' ) or '',
                 ( fx_extract( div, './div/div[2]/text()' ) or '' ).strip( ' -' ) )
Example #25
    def parse_from( self, div ):
        return fx_extract( div, './b/text()' ) or ''
Example #26
    def next_top_img( self, response ):
        urls = f_xpath( response, '//div[contains(@style, "margin-bottom")]' ).xpath( './div' )[1:]
        for url in urls:
            nxt = fx_extract( url, './a[contains(text(), "Next")]/@href' )
            if nxt:
                return nxt
Example #27
    def parse_star_help_date( self, div ):
        return ( self._parse_star( div ),
                 fx_extract( div, './span[2]/b/text()' ) or '',
                 fx_extract( div, './span[2]/nobr/text()' ) or '' )
Example #28
    def _parse_large_img(self, p):
        return fx_extract( p, './div[@class="offer-left"]' + \
                              '/div[contains(@class, "large")]' + \
                              '/img/@src' ) or ''
Example #29
    def parse_star_help_date(self, div):
        return (self._parse_star(div), fx_extract(div, './span[2]/b/text()')
                or '', fx_extract(div, './span[2]/nobr/text()') or '')
Example #30
    def _parse_retailer_category_brand(self, p):
        retailer = (fx_extract( p, './p[@class="retailer"]/a/text()' ) or '').strip()
        category = (fx_extract( p, './p[@class="parentCategory"]/a/text()' ) or '').strip()
        brand    = (fx_extract( p, './p[@class="brand"]/a/text()' ) or '').strip()

        return retailer, category, brand
Example #31
    def parse_reviewer_from(self, div):
        return (fx_extract(div, './div/div[2]/a/span/text()')
                or '', (fx_extract(div, './div/div[2]/text()')
                        or '').strip(' -'))