def parse_one_sale(self, response):
        #with open( '/tmp/findnsave_sales_one.html', 'w' ) as f:
        #    f.write( response.body )

        sale = f_xpath( response, '//div[contains(@class, "offer-description-wrapper") ' + \
                                   ' and contains(@class, "clearfix")]' )
        if not sale:
            return

        starts, expiration = self._parse_date( response )
        pct_off = self._parse_callout( sale )
        lg_img = self._parse_large_img( sale )

        sr = f_xpath( sale, './div[@class="offer-right"]' )
        name = fx_extract( sr, './h1[@itemprop="name"]/text()' )
        if name is None:
            logger.debug( 'name not crawled in : ' + response.url )
            return

        p_c, p_p, p_r, p_u = self._parse_price( sr )
        desc = self._parse_desc( sr )
        retailer, category, brand = self._parse_retailer_category_brand( sr )

        # only consumed by the commented-out JSON dump below
        data = [ response.meta[ 'id' ], name,
                    p_c, p_p, p_r, p_u, pct_off,
                    starts, expiration,
                    retailer, category, brand,
                    response.url, response.meta[ 'th_img' ], lg_img,
                    desc, ]

        d = FindnsaveSaleItem()
        d['area'] = 'newyork'
        d['id'] = response.meta[ 'id' ]
        d['name'] = escape(name)
        d['priceCurrency'] = p_c
        d['price'] = p_p
        d['priceRegular'] = p_r
        d['priceUtilDate'] = p_u
        d['priceOff'] = pct_off
        d['retailer'] = escape(retailer)
        d['category'] = escape(category)
        d['brand'] = escape(brand)
        d['desc'] = escape(desc)

        yield d

        #self.jsonfile.write( json.dumps( data ) + '\n' )
        logger.info( 'crawl : `' + name + '` OK' )
        return
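
The helpers f_xpath, fx_extract, xpath and xpath_extract are not defined in these snippets. A minimal sketch consistent with how they are called here (first match vs. all matches), assuming they wrap Scrapy's Selector API:

# sketch only: inferred from the call sites, not the original helpers
def xpath(node, path):
    # all matching selectors
    return node.xpath(path)

def xpath_extract(node, path):
    # all matches, extracted to strings
    return node.xpath(path).extract()

def f_xpath(node, path):
    # first matching selector, or None
    r = node.xpath(path)
    return r[0] if r else None

def fx_extract(node, path):
    # first match extracted to a string, or None
    r = node.xpath(path).extract()
    return r[0] if r else None
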
Example #2
    def parse_one_top(self, response):

        logger.info('fetch : ' + response.url)

        img = f_xpath(response,
                      '//div[contains(@class, "inner-main-content")]')

        meta = {}
        meta['name'] = fx_extract(img, './div/h3/text()').strip().strip('#')
        meta['img'] = fx_extract(img, './/div[@class="inner-image"]/img/@src')
        meta['key'] = meta['img'][self.prefix_len:]
        meta['from'] = fx_extract(img, './/div[@class="inner-image"]/a/@href')
        meta['desc'] = fx_extract(img, './div/p/text()')

        # append this image's metadata to the list carried along in request meta
        curr_meta = response.meta
        curr_meta['top'].append(meta)

        nexturl = self.next_top_img(response)
        if nexturl:
            yield scrapy.http.Request(url=nexturl,
                                      callback=self.parse_one_top,
                                      meta=curr_meta,
                                      dont_filter=True)
        else:
            # no next image: upload the collected metadata, then each image by URL
            cli = authedclient()
            cli.upload_data(curr_meta['key'],
                            json.dumps(curr_meta),
                            headers={'Content-Type': 'text/json'})
            logger.info('upload : ' + curr_meta['key'])

            for meta in curr_meta['top']:
                put_file_from_url(cli, meta['key'], meta['img'])
                logger.info('upload : %s from %s' % (meta['key'], meta['img']))
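
put_file_from_url and authedclient come from elsewhere in the project; one plausible shape for the former, assuming the client exposes the same upload_data() used above (Python 2, matching the xrange usage later in these snippets):

# sketch only: the real helper lives elsewhere in the project
import urllib2

def put_file_from_url(cli, key, url):
    # download the image and store it under the given key
    body = urllib2.urlopen(url).read()
    cli.upload_data(key, body)
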
Example #4
    def next_page( self, response ):

        nexturl = f_xpath( response, '//table[@class="CMheadingBar"]/tr/td[1]/div/span' )
        if nexturl is None:
            return None

        return fx_extract( nexturl, './a[contains(text(), "Next")]/@href' )
    def _parse_date(self, response):
        date = f_xpath( response, '//div[contains(@class, "offer-pdp-hd") ' + \
                                   ' and contains(@class, "clearfix")]' )
        starts = ''
        expiration = ''
        if not date:
            return starts, expiration

        st = f_xpath( date, './/div[@class="offer-right"]' )
        if st:
            starts = fx_extract( st, './h2/text()' ) or ''

        exp = f_xpath( date, './/div[contains(@class, "expiration") ' + \
                               ' and contains(@class, "with-date") ]' )
        if exp:
            expiration = fx_extract( exp, './div/text()' ) or ''
        return starts, expiration
Example #6
    def next_top_img(self, response):
        urls = f_xpath(
            response,
            '//div[contains(@style, "margin-bottom")]').xpath('./div')[1:]
        for url in urls:
            nxt = fx_extract(url, './a[contains(text(), "Next")]/@href')
            if nxt:
                return nxt
Example #7
    def _parse_callout(self, p):
        callout = f_xpath( p, './div[@class="callout"]' )

        pct_off = ''
        if callout:
            pct = fx_extract( callout, './span[@class="pct"]/text()' ) or ''
            off = fx_extract( callout, './span[@class="off"]/text()' ) or ''
            pct_off = (pct + ' ' + off).strip()

        return pct_off
    def store_next_page( self, response ):
        nexturl = f_xpath( response, '//div[@class="pagination"]/span[@class="next"]' )
        if nexturl is None:
            return None

        uri = fx_extract( nexturl, './a[contains(text(), "Next")]/@href' )
        if not uri:
            return None

        return self.rooturl + uri
    def parse(self, response):
        #with open( '/tmp/findnsave_sales.html', 'w' ) as f:
        #    f.write( response.body )

        logger.info( 'fetch : ' + response.url )
        sales = f_xpath( response, '//ul[contains(@class, "listing") ' + \
                                   ' and contains(@class, "retailer-detail")' + \
                                   ' and contains(@class, "infinite")]' ).xpath(
                                './li[starts-with(@id, "offer-")]' )
        for s in sales:
            s = f_xpath( s, './div' ).xpath( './a' )
            id = fx_extract( s, './@data-offer-id' )
            href = fx_extract( s, './@href' )
            th_img = fx_extract( s, './img/@src' )

            if not ( id and href and th_img ):
                continue

            # TODO: skip this offer if its id is already in the db

            if not href.startswith( 'http://' ):
                href = self.rooturl + href

            meta = { 'id' : id,
                     'href' : href,
                     'th_img' : th_img }

            yield scrapy.http.Request( url = href,
                                       callback = self.parse_one_sale,
                                       meta = meta,
                                       dont_filter = True )


        next_url = self.store_next_page( response )
        if next_url is None:
            return

        yield scrapy.http.Request( url = next_url, callback = self.parse,
                                   dont_filter = True )
Example #11
    def parse(self, response):

        logger.info('fetch : ' + response.url)
        catgos = f_xpath( response, '//ul[contains(@class, "listing") ' + \
                                    ' and contains(@class, "grouping")' + \
                                    ' and contains(@class, "infinite")]' ).xpath( './li' )

        for ctg in catgos:
            ctg = f_xpath(ctg, './/div[@class="chiclet-actions"]/a')
            if not ctg:
                continue

            href = fx_extract(ctg, './@href')
            name = fx_extract(ctg, './@title')
            name = self.parse_categorie_name(name)

            try:
                _c, cid, id = href.strip('/').split('/')
            except Exception:
                # href does not split into the expected three segments
                continue

            #csv.writer( self.csv_fd ).writerow( [ id, cid, name, href ] )

            d = FindnsaveCategoryItem()
            d['id'] = id
            d['name'] = name
            d['nameid'] = cid
            d['uri'] = href

            yield d

        next_url = self.categorie_next_page(response)
        if next_url is None:
            return

        yield scrapy.http.Request(url=next_url,
                                  callback=self.parse,
                                  dont_filter=True)
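
The unpacking above assumes category links look like /<section>/<nameid>/<id>; an illustrative value (not taken from the site):

# illustrative only: the sample path is an assumption based on the unpacking above
_c, cid, id_ = '/category/grocery/123'.strip('/').split('/')
# _c == 'category', cid == 'grocery', id_ == '123'
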
Example #12
    def parse(self, response):

        logger.info( 'fetch : ' + response.url )
        tops = f_xpath( response, '//ul[contains(@class, "thumbnails")]' ).xpath( './li' )

        for top in tops:
            top = f_xpath( top, './div[@class="thumbnail"]' )
            if not top:
                continue

            name = fx_extract( top, './p/strong/text()' )
            href = fx_extract( top, './a/@href' ).strip()

            curr_meta = {}
            curr_meta[ 'name' ] = name
            curr_meta[ 'url' ] = href
            curr_meta[ 'key' ] = 'meta/' + href[ self.prefix_len: ] + '.json'
            curr_meta[ 'top' ] = []

            yield scrapy.http.Request( url = href,
                                       callback = self.parse_one_top,
                                       meta = curr_meta,
                                       dont_filter = True )
    def parse(self, response):

        logger.info('fetch : ' + response.url)
        brands = f_xpath( response, '//ul[contains(@class, "brands") ' + \
                                    ' and contains(@class, "columnize")' + \
                                    ' and contains(@class, "clearfix")]' ).xpath( './li' )

        for br in brands:
            br = f_xpath(br, './a')
            if not br:
                continue

            href = fx_extract(br, './@href')
            name = fx_extract(br, './text()')

            try:
                _b, bid, id = href.strip('/').split('/')
            except Exception:
                # href does not split into the expected three segments
                continue

            #csv.writer( self.csv_fd ).writerow( [ id, bid, name, href ] )

            d = FindnsaveBrandItem()
            d['id'] = id
            d['name'] = escape(name)
            d['nameid'] = bid
            d['uri'] = href

            yield d

        next_url = self.brand_next_page(response)
        if next_url is None:
            return

        yield scrapy.http.Request(url=next_url,
                                  callback=self.parse,
                                  dont_filter=True)
Example #16
    def _parse_price(self, p):
        price = f_xpath( p, './div[@class="product-price"]' )
        p_c = fx_extract( price, './/span[@itemprop="priceCurrency"]/@content' ) or ''
        p_p = fx_extract( price, './/span[@class="price"]/@content' ) or '-1'
        p_r = fx_extract( price, './/span[@class="regular-price"]/text()' ) or ''
        p_u = fx_extract( price, './/span[@itemprop="priceValidUntil"]/@content' ) or ''
        try:
            float( p_p )
        except Exception:
            p_p = '-1'

        if p_c == 'USD':
            if '$' in p_r:
                p_r = p_r.split('$')[1].split()[0]

        return p_c, p_p, p_r, p_u
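
A quick check of the normalization above; the sample string is an assumption about how the regular price is rendered:

# illustrative only: a non-numeric price falls back to '-1' via the float() check,
# and a USD regular price keeps just the number after '$'
assert 'Reg. $12.99 ea'.split('$')[1].split()[0] == '12.99'
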
Example #18
    def parse(self, response):

        logger.info( 'fetch : ' + response.url )
        states = f_xpath( response, '//select[@id="states-dropdown"]' ).xpath( './option' )

        # map state abbreviation -> full name from the dropdown options
        sts = {}
        for st in states:
            st_short = fx_extract( st, './@value' )
            st_name = fx_extract( st, './text()' )

            if not st_short:
                continue

            if st_short not in sts:
                sts[ st_short ] = st_name

        states = xpath( response, '//ul[contains(@class, "hide") ' + \
                                  ' and contains(@class, "clearfix")]' )

        #state_fd = open( '/tmp/state_url.csv', 'w' )
        #csvw = csv.writer( state_fd )
        for st in states:
            st_short = fx_extract( st, './@id' )
            locs = st.xpath( './li' )
            for loc in locs:
                url = fx_extract( loc, './a/@href' )
                area = fx_extract( loc, './a/text()' )
                #csvw.writerow( [ st_short, sts.get( st_short, '' ), area, url ] )

                if st_short not in sts:
                    continue

                d = FindnsaveAreaItem()
                d[ 'area'  ] = area
                d[ 'short' ] = st_short
                d[ 'state' ] = sts[ st_short ]
                d[ 'url'   ] = url

                yield d
    def _parse_desc(self, p):
        desc = f_xpath( p, './div[@class="offer-descriptions"]' )
        desc = ' '.join( [ x.strip() for x in \
                    xpath_extract( desc, './/div[@class="offer-description"]/text()' ) ] )

        return desc
Example #20
    def parse(self, response):
        #with open( '/tmp/amazon.html', 'w' ) as f:
        #    f.write( response.body )

        logger.info('fetch : ' + response.url)
        prdid = response.url.split('?')[0].split('/')[-1]

        review = f_xpath(response, '//table[@id="productReviews"]/tr/td')
        if review is None:
            yield scrapy.http.Request(url=next_product_url(),
                                      callback=self.parse)
            return

        rids = xpath_extract(review, './a/@name')
        details = xpath(review, './div')

        length = min(len(rids), len(details))
        for i in xrange(length):

            rdetail = details[i]
            divs = xpath(rdetail, './div')

            # len( divs ) is at most 7 (indices 0 - 6):
            # 0 : number of helpful review
            # 1 : star, helpful text, date
            # 2 : reviewer, reviewer from
            # 3 : from
            # 4 : free product
            # 5 : reviewText
            # 6 : helpful?

            d = self.empty_item()
            d['prdid'] = prdid
            d['rid'] = rids[i]
            d['text'] = ' '.join(
                xpath_extract(rdetail, './div[@class="reviewText"]/text()'))

            while len(divs) > 0:
                div = divs[0]
                divs = divs[1:]

                text = div.extract()

                if 'people found the following review helpful' in text:
                    d['num_help_review'] = self.parse_num_help_review(div)
                    continue

                if 'out of' in text and 'stars' in text and '</nobr>' in text:
                    d[ 'star' ], d[ 'help_text' ], d[ 'date' ] = \
                                self.parse_star_help_date( div )
                    continue

                if 'By' in text and 'See all my reviews' in text:
                    d[ 'reviewer' ], d[ 'reviewer_from' ] = \
                                self.parse_reviewer_from( div )
                    continue

                if 'This review is from' in text:
                    d['rfrom'] = self.parse_from(div)

                break

            yield d

        next_url = self.next_page( response ) or \
                        next_product_url()

        # see http://doc.scrapy.org/en/latest/topics/request-response.html
        yield scrapy.http.Request(url=next_url,
                                  callback=self.parse,
                                  dont_filter=True)