def parse_one_top(self, response):
    """Scrape one image from a top page; follow 'Next' pages, then upload
    the accumulated metadata JSON and every collected image."""
    logger.info('fetch : ' + response.url)

    content = f_xpath(response, '//div[contains(@class, "inner-main-content")]')
    img_src = fx_extract(content, './/div[@class="inner-image"]/img/@src')

    # Keys are inserted in the same order the serialized JSON had before.
    record = {
        'name': fx_extract(content, './div/h3/text()').strip().strip('#'),
        'img': img_src,
        'key': img_src[self.prefix_len:],
        'from': fx_extract(content, './/div[@class="inner-image"]/a/@href'),
        'desc': fx_extract(content, './div/p/text()'),
    }

    collected = response.meta
    collected['top'].append(record)

    nexturl = self.next_top_img(response)
    if nexturl:
        # More pages: carry the accumulated meta forward.
        yield scrapy.http.Request(url=nexturl, callback=self.parse_one_top,
                                  meta=collected, dont_filter=True)
        return

    # Last page: push the metadata blob, then each image file.
    cli = authedclient()
    cli.upload_data(collected['key'], json.dumps(collected),
                    headers={'Content-Type': 'text/json'})
    logger.info('upload : ' + collected['key'])
    for record in collected['top']:
        put_file_from_url(cli, record['key'], record['img'])
        logger.info('upload : %s from %s' % (record['key'], record['img']))
def parse_one_top(self, response):
    """Scrape one image entry from a top page.

    Appends the entry to ``response.meta['top']``; while a 'Next' page
    exists it re-requests itself with the same meta, otherwise it uploads
    the accumulated JSON plus each image via ``put_file_from_url``.
    """
    logger.info('fetch : ' + response.url)
    img = f_xpath(response, '//div[contains(@class, "inner-main-content")]')
    meta = {}
    # Title text; leading/trailing '#' characters are decoration.
    meta['name'] = fx_extract(img, './div/h3/text()').strip().strip('#')
    meta['img'] = fx_extract(img, './/div[@class="inner-image"]/img/@src')
    # Storage key = image URL with the common prefix removed.
    meta['key'] = meta['img'][self.prefix_len:]
    meta['from'] = fx_extract(img, './/div[@class="inner-image"]/a/@href')
    meta['desc'] = fx_extract(img, './div/p/text()')
    curr_meta = response.meta
    curr_meta['top'].append(meta)
    nexturl = self.next_top_img(response)
    if nexturl:
        # Keep crawling; dont_filter allows revisiting paginated URLs.
        yield scrapy.http.Request(url=nexturl, callback=self.parse_one_top,
                                  meta=curr_meta, dont_filter=True)
    else:
        # Final page: upload metadata JSON, then every image.
        cli = authedclient()
        cli.upload_data(curr_meta['key'], json.dumps(curr_meta),
                        headers={'Content-Type': 'text/json'})
        logger.info('upload : ' + curr_meta['key'])
        for meta in curr_meta['top']:
            put_file_from_url(cli, meta['key'], meta['img'])
            logger.info('upload : %s from %s' % (meta['key'], meta['img']))
def _parse_callout(self, p):
    """Return the '<pct> <off>' callout text for an offer, or '' if absent."""
    box = f_xpath(p, './div[@class="callout"]')
    if not box:
        return ''
    pct = fx_extract(box, './span[@class="pct"]/text()') or ''
    off = fx_extract(box, './span[@class="off"]/text()') or ''
    return (pct + ' ' + off).strip()
def next_page(self, response):
    """Href of the 'Next' pagination link, or None when there is none."""
    bar = f_xpath(response, '//table[@class="CMheadingBar"]/tr/td[1]/div/span')
    return None if bar is None else \
        fx_extract(bar, './a[contains(text(), "Next")]/@href')
def _parse_price(self, p):
    """Extract price fields from a product node.

    Returns (currency, price, regular_price, valid_until) as strings.
    ``price`` falls back to '-1' when missing or non-numeric.

    Fixes vs. original:
    - the USD regular-price normalization no longer raises IndexError when
      nothing follows the '$' (e.g. text is just 'Reg. $');
    - the numeric check catches only the exceptions ``float()`` can raise
      instead of every Exception.
    """
    price = f_xpath(p, './div[@class="product-price"]')
    p_c = fx_extract(price, './/span[@itemprop="priceCurrency"]/@content') or ''
    p_p = fx_extract(price, './/span[@class="price"]/@content') or '-1'
    p_r = fx_extract(price, './/span[@class="regular-price"]/text()') or '-1' if False else \
          fx_extract(price, './/span[@class="regular-price"]/text()') or ''
    p_u = fx_extract(price, './/span[@itemprop="priceValidUntil"]/@content') or ''

    try:
        float(p_p)
    except (TypeError, ValueError):
        p_p = '-1'

    # Normalize e.g. 'Reg. $12.99 each' -> '12.99' for USD prices.
    if p_c == 'USD' and '$' in p_r:
        tokens = p_r.split('$')[1].split()
        if tokens:
            p_r = tokens[0]
    return p_c, p_p, p_r, p_u
def _parse_date(self, response):
    """Return (starts, expiration) text from the offer header; '' when missing."""
    starts, expiration = '', ''
    header = f_xpath(response, '//div[contains(@class, "offer-pdp-hd") '
                               ' and contains(@class, "clearfix")]')
    if not header:
        return starts, expiration
    right = f_xpath(header, './/div[@class="offer-right"]')
    if right:
        starts = fx_extract(right, './h2/text()') or ''
    exp = f_xpath(header, './/div[contains(@class, "expiration") '
                          ' and contains(@class, "with-date") ]')
    if exp:
        expiration = fx_extract(exp, './div/text()') or ''
    return starts, expiration
def next_page(self, response):
    """Return the href of the 'Next' pagination link, or None when absent."""
    nexturl = f_xpath(response, '//table[@class="CMheadingBar"]/tr/td[1]/div/span')
    if nexturl is None:
        return None
    return fx_extract(nexturl, './a[contains(text(), "Next")]/@href')
def next_top_img(self, response):
    """Href of the 'Next' link among the top-image divs, or None."""
    # Skip the first child div; the remaining ones may hold the nav links.
    candidates = f_xpath(response,
                         '//div[contains(@style, "margin-bottom")]').xpath('./div')[1:]
    for div in candidates:
        href = fx_extract(div, './a[contains(text(), "Next")]/@href')
        if href:
            return href
    return None
def parse_num_help_review(self, div):
    """Parse 'X of Y people found the following review helpful'.

    Returns (X, Y) as ints, or None when the text node is missing.
    """
    text = fx_extract(div, './text()')
    if text is None:
        return None
    words = text.split()
    return int(words[0]), int(words[2])
def store_next_page(self, response):
    """Absolute URL of the next store page, or None when pagination ends."""
    span = f_xpath(response, '//div[@class="pagination"]/span[@class="next"]')
    if span is None:
        return None
    uri = fx_extract(span, './a[contains(text(), "Next")]/@href')
    return self.rooturl + uri if uri else None
def parse_num_help_review(self, div):
    """Parse helpfulness counts from a review div.

    Returns (helpful, total) ints, or None when the text node is absent.
    Assumes words 0 and 2 are numeric; ValueError propagates otherwise.
    """
    # text : 3 of 3 people found the following review helpful
    text = fx_extract(div, './text()')
    if text is None:
        return None
    text = text.strip().split()
    return (int(text[0]), int(text[2]))
def _parse_star(self, div):
    """Parse '5.0 out of 5 stars' into (5.0, 5.0); None when missing."""
    text = fx_extract(div, './span[1]/span/span/text()')
    if text is None:
        return None
    words = text.split()
    return float(words[0]), float(words[3])
def _parse_star(self, div):
    """Parse a star-rating string into (rating, out_of) floats.

    Returns None when the text node is missing.
    """
    # text : 5.0 out of 5 stars
    star = fx_extract(div, './span[1]/span/span/text()')
    if star is None:
        return None
    else:
        star = star.strip().split()
        # words[0] = rating, words[3] = scale ('5.0 out of 5 stars')
        star = (float(star[0]), float(star[3]))
    return star
def parse(self, response):
    """Parse a retailer listing page: queue a detail request per offer,
    then follow infinite-scroll pagination.

    Fixes vs. original:
    - ``href.startswith('http://')`` ignored https, so absolute https URLs
      would wrongly get ``self.rooturl`` prepended; now both schemes pass;
    - local ``id`` renamed so it no longer shadows the builtin.
    """
    logger.info('fetch : ' + response.url)

    sales = f_xpath(response,
                    '//ul[contains(@class, "listing") '
                    ' and contains(@class, "retailer-detail")'
                    ' and contains(@class, "infinite")]').xpath(
                        './li[starts-with(@id, "offer-")]')

    for li in sales:
        link = f_xpath(li, './div').xpath('./a')
        offer_id = fx_extract(link, './@data-offer-id')
        href = fx_extract(link, './@href')
        th_img = fx_extract(link, './img/@src')
        if not (offer_id and href and th_img):
            continue
        # TODO : skip offers already present in db
        if not href.startswith(('http://', 'https://')):
            # Site-relative link: make it absolute.
            href = self.rooturl + href
        meta = {'id': offer_id, 'href': href, 'th_img': th_img}
        yield scrapy.http.Request(url=href, callback=self.parse_one_sale,
                                  meta=meta, dont_filter=True)

    next_url = self.store_next_page(response)
    if next_url is None:
        return
    yield scrapy.http.Request(url=next_url, callback=self.parse,
                              dont_filter=True)
def parse(self, response):
    """Yield a FindnsaveAreaItem for every (state, area) pair on the page."""
    logger.info('fetch : ' + response.url)

    # Build short-code -> full-name map from the states dropdown.
    options = f_xpath(response, '//select[@id="states-dropdown"]').xpath('./option')
    names = {}
    for opt in options:
        short = fx_extract(opt, './@value')
        full = fx_extract(opt, './text()')
        if not short:
            continue
        names.setdefault(short, full)

    # One <ul> per state; its @id is the state short code.
    state_lists = xpath(response, '//ul[contains(@class, "hide") '
                                  ' and contains(@class, "clearfix")]')
    for ul in state_lists:
        short = fx_extract(ul, './@id')
        for li in ul.xpath('./li'):
            url = fx_extract(li, './a/@href')
            area = fx_extract(li, './a/text()')
            if short not in names:
                continue
            d = FindnsaveAreaItem()
            d['area'] = area
            d['short'] = short
            d['state'] = names[short]
            d['url'] = url
            yield d
def parse(self, response):
    """Yield a FindnsaveCategoryItem per category chiclet; follow pagination.

    Fixes vs. original: the bare ``except:`` silently swallowed every error
    while unpacking the href; it is narrowed to the two failures that can
    actually occur there (href is None -> AttributeError, path not made of
    exactly three segments -> ValueError). The local ``id`` was renamed so
    it no longer shadows the builtin.
    """
    logger.info('fetch : ' + response.url)

    catgos = f_xpath(response,
                     '//ul[contains(@class, "listing") '
                     ' and contains(@class, "grouping")'
                     ' and contains(@class, "infinite")]').xpath('./li')

    for ctg in catgos:
        link = f_xpath(ctg, './/div[@class="chiclet-actions"]/a')
        if not link:
            continue
        href = fx_extract(link, './@href')
        name = fx_extract(link, './@title')
        name = self.parse_categorie_name(name)
        try:
            # href is expected to look like /<c>/<name-id>/<numeric-id>/
            _c, cid, cat_id = href.strip('/').split('/')
        except (AttributeError, ValueError):
            continue
        d = FindnsaveCategoryItem()
        d['id'] = cat_id
        d['name'] = name
        d['nameid'] = cid
        d['uri'] = href
        yield d

    next_url = self.categorie_next_page(response)
    if next_url is None:
        return
    yield scrapy.http.Request(url=next_url, callback=self.parse,
                              dont_filter=True)
def parse(self, response):
    """Queue a parse_one_top request for every thumbnail on the page."""
    logger.info('fetch : ' + response.url)
    entries = f_xpath(response, '//ul[contains(@class, "thumbnails")]').xpath('./li')
    for li in entries:
        thumb = f_xpath(li, './div[@class="thumbnail"]')
        if not thumb:
            continue
        title = fx_extract(thumb, './p/strong/text()')
        link = fx_extract(thumb, './a/@href').strip()
        # Seed meta carried through the whole top-image crawl.
        meta = {
            'name': title,
            'url': link,
            'key': 'meta/' + link[self.prefix_len:] + '.json',
            'top': [],
        }
        yield scrapy.http.Request(url=link, callback=self.parse_one_top,
                                  meta=meta, dont_filter=True)
def parse(self, response):
    """Yield a FindnsaveBrandItem per brand link; follow pagination.

    Fixes vs. original: the bare ``except:`` silently swallowed every error
    while unpacking the href; it is narrowed to the failures that can occur
    there (href is None -> AttributeError, wrong segment count ->
    ValueError). The local ``id`` was renamed to avoid shadowing the builtin.
    """
    logger.info('fetch : ' + response.url)

    brands = f_xpath(response,
                     '//ul[contains(@class, "brands") '
                     ' and contains(@class, "columnize")'
                     ' and contains(@class, "clearfix")]').xpath('./li')

    for br in brands:
        link = f_xpath(br, './a')
        if not link:
            continue
        href = fx_extract(link, './@href')
        name = fx_extract(link, './text()')
        try:
            # href is expected to look like /<b>/<name-id>/<numeric-id>/
            _b, bid, brand_id = href.strip('/').split('/')
        except (AttributeError, ValueError):
            continue
        d = FindnsaveBrandItem()
        d['id'] = brand_id
        d['name'] = escape(name)
        d['nameid'] = bid
        d['uri'] = href
        yield d

    next_url = self.brand_next_page(response)
    if next_url is None:
        return
    yield scrapy.http.Request(url=next_url, callback=self.parse,
                              dont_filter=True)
def parse(self, response):
    """Yield a FindnsaveCategoryItem per category chiclet; follow pagination."""
    logger.info('fetch : ' + response.url)
    catgos = f_xpath(response,
                     '//ul[contains(@class, "listing") '
                     ' and contains(@class, "grouping")'
                     ' and contains(@class, "infinite")]').xpath('./li')
    for ctg in catgos:
        ctg = f_xpath(ctg, './/div[@class="chiclet-actions"]/a')
        if not ctg:
            continue
        href = fx_extract(ctg, './@href')
        name = fx_extract(ctg, './@title')
        name = self.parse_categorie_name(name)
        try:
            # href presumably /<c>/<name-id>/<numeric-id>/ — verify on site
            _c, cid, id = href.strip('/').split('/')
        except:
            # NOTE(review): bare except hides real errors; should be
            # narrowed to (AttributeError, ValueError).
            continue
        #csv.writer( self.csv_fd ).writerow( [ id, cid, name, href ] )
        d = FindnsaveCategoryItem()
        d['id'] = id
        d['name'] = name
        d['nameid'] = cid
        d['uri'] = href
        yield d
    next_url = self.categorie_next_page(response)
    if next_url is None:
        return
    yield scrapy.http.Request(url=next_url, callback=self.parse,
                              dont_filter=True)
def parse_one_sale(self, response):
    """Parse one offer detail page and yield a FindnsaveSaleItem.

    Expects ``response.meta`` to carry 'id' and 'th_img' from the listing
    parser. Returns silently when the description wrapper or the offer
    name cannot be found.
    """
    #with open( '/tmp/findnsave_sales_one.html', 'w' ) as f:
    #    f.write( response.body )
    sale = f_xpath(response,
                   '//div[contains(@class, "offer-description-wrapper") ' +
                   ' and contains(@class, "clearfix")]')
    if not sale:
        return
    starts, expiration = self._parse_date(response)
    pct_off = self._parse_callout(sale)
    lg_img = self._parse_large_img(sale)
    sr = f_xpath(sale, './div[@class="offer-right"]')
    name = fx_extract(sr, './h1[@itemprop="name"]/text()')
    if name is None:
        logger.debug('not crawl name in : ' + response.url)
        return
    p_c, p_p, p_r, p_u = self._parse_price(sr)
    desc = self._parse_desc(sr)
    retailer, category, brand = self._parse_retailer_category_brand(sr)
    # NOTE(review): 'data' feeds only the commented-out jsonfile dump below;
    # starts/expiration/lg_img/th_img are used nowhere else.
    data = [
        response.meta['id'],
        name,
        p_c, p_p, p_r, p_u, pct_off,
        starts, expiration,
        retailer, category, brand,
        response.url,
        response.meta['th_img'],
        lg_img,
        desc,
    ]
    d = FindnsaveSaleItem()
    # NOTE(review): area is hard-coded — confirm this spider is NY-only.
    d['area'] = 'newyork'
    d['id'] = response.meta['id']
    d['name'] = escape(name)
    d['priceCurrency'] = p_c
    d['price'] = p_p
    d['priceRegular'] = p_r
    # 'priceUtilDate' (sic) — key name is defined by FindnsaveSaleItem.
    d['priceUtilDate'] = p_u
    d['priceOff'] = pct_off
    d['retailer'] = escape(retailer)
    d['category'] = escape(category)
    d['brand'] = escape(brand)
    d['desc'] = escape(desc)
    yield d
    #self.jsonfile.write( json.dumps( data ) + '\n' )
    logger.info('crawl : `' + name + '` OK')
    return
def parse(self, response):
    """Queue a parse_one_top request for every thumbnail on the page."""
    logger.info('fetch : ' + response.url)
    tops = f_xpath(response, '//ul[contains(@class, "thumbnails")]').xpath('./li')
    for top in tops:
        top = f_xpath(top, './div[@class="thumbnail"]')
        if not top:
            continue
        name = fx_extract(top, './p/strong/text()')
        href = fx_extract(top, './a/@href').strip()
        # Seed meta carried through the whole top-image crawl.
        curr_meta = {}
        curr_meta['name'] = name
        curr_meta['url'] = href
        curr_meta['key'] = 'meta/' + href[self.prefix_len:] + '.json'
        curr_meta['top'] = []
        yield scrapy.http.Request(url=href, callback=self.parse_one_top,
                                  meta=curr_meta, dont_filter=True)
def parse(self, response):
    """Yield a FindnsaveBrandItem per brand link; follow pagination."""
    logger.info('fetch : ' + response.url)
    brands = f_xpath(response,
                     '//ul[contains(@class, "brands") '
                     ' and contains(@class, "columnize")'
                     ' and contains(@class, "clearfix")]').xpath('./li')
    for br in brands:
        br = f_xpath(br, './a')
        if not br:
            continue
        href = fx_extract(br, './@href')
        name = fx_extract(br, './text()')
        try:
            # href presumably /<b>/<name-id>/<numeric-id>/ — verify on site
            _b, bid, id = href.strip('/').split('/')
        except:
            # NOTE(review): bare except hides real errors; should be
            # narrowed to (AttributeError, ValueError).
            continue
        #csv.writer( self.csv_fd ).writerow( [ id, bid, name, href ] )
        d = FindnsaveBrandItem()
        d['id'] = id
        d['name'] = escape(name)
        d['nameid'] = bid
        d['uri'] = href
        yield d
    next_url = self.brand_next_page(response)
    if next_url is None:
        return
    yield scrapy.http.Request(url=next_url, callback=self.parse,
                              dont_filter=True)
def parse_from(self, div):
    """Text of the <b> child node, or '' when missing."""
    text = fx_extract(div, './b/text()')
    return text if text else ''
def parse_reviewer_from( self, div ): return ( fx_extract( div, './div/div[2]/a/span/text()' ) or '', ( fx_extract( div, './div/div[2]/text()' ) or '' ).strip( ' -' ) )
def parse_from(self, div):
    """Return the text of the <b> child node, or '' when missing."""
    return fx_extract(div, './b/text()') or ''
def next_top_img(self, response):
    """Return the href of the 'Next' link among the top-image divs.

    Implicitly returns None when no 'Next' link exists.
    """
    # Skip the first child div; the rest may carry navigation links.
    urls = f_xpath(response,
                   '//div[contains(@style, "margin-bottom")]').xpath('./div')[1:]
    for url in urls:
        nxt = fx_extract(url, './a[contains(text(), "Next")]/@href')
        if nxt:
            return nxt
def parse_star_help_date(self, div):
    """Return (star rating tuple, helpful text, date text); '' fallbacks."""
    rating = self._parse_star(div)
    helpful = fx_extract(div, './span[2]/b/text()') or ''
    date = fx_extract(div, './span[2]/nobr/text()') or ''
    return rating, helpful, date
def _parse_large_img(self, p):
    """Src of the large offer image, or '' when absent."""
    xp = ('./div[@class="offer-left"]'
          '/div[contains(@class, "large")]'
          '/img/@src')
    return fx_extract(p, xp) or ''
def parse_star_help_date(self, div):
    """Return (star rating tuple, helpful text, date text); '' fallbacks."""
    return (self._parse_star(div),
            fx_extract(div, './span[2]/b/text()') or '',
            fx_extract(div, './span[2]/nobr/text()') or '')
def _parse_retailer_category_brand(self, p):
    """Return stripped (retailer, category, brand) text; '' when absent."""
    seller = fx_extract(p, './p[@class="retailer"]/a/text()') or ''
    cat = fx_extract(p, './p[@class="parentCategory"]/a/text()') or ''
    maker = fx_extract(p, './p[@class="brand"]/a/text()') or ''
    return seller.strip(), cat.strip(), maker.strip()
def parse_reviewer_from(self, div):
    """Return (reviewer name, origin) with '' defaults; origin trimmed of ' -'."""
    return (fx_extract(div, './div/div[2]/a/span/text()') or '',
            (fx_extract(div, './div/div[2]/text()') or '').strip(' -'))