import csv
import time
import json

import scrapy

from amazon.utils.util import escape
from amazon.utils import genlog
from amazon.item.findnsave import FindnsaveBrandItem
from amazon.utils.rediscli import get_cli, RedisLock, RedisLockError
from amazon.utils.util import first_item, safe, \
        xpath, f_xpath, first_item_xpath, \
        xpath_extract, fx_extract, first_item_xpath_extract

logger = genlog.createlogger( 'findnsave_brands' )
genlog.logger = logger


class FindnsaveBrandsSpider(scrapy.Spider):

    logger = logger

    name = 'findnsavebrands'
    allowed_domains = ( "findnsave.com", )

    location = 'newyork'
    rooturl = "http://%s.findnsave.com" % location
    start_urls = [ rooturl + "/brands/" ]

    #csv_fd = open( '/tmp/brands.csv', 'w' )
    #csv.writer( csv_fd ).writerow( [ 'id', 'cid', 'name', 'href' ] )
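    # The original file is truncated at this point. What follows is a minimal
    # hedged sketch of a parse() callback, assuming the /brands/ page lists
    # brands as anchor elements; the XPath and the FindnsaveBrandItem field
    # names ('name', 'uri') are assumptions, not the author's original code.
    def parse(self, response):
        logger.info( 'fetch : ' + response.url )

        for a in xpath( response, '//div[contains(@class, "brands")]//a' ):
            item = FindnsaveBrandItem()
            item[ 'name' ] = fx_extract( a, './text()' )  # hypothetical field
            item[ 'uri' ] = fx_extract( a, './@href' )    # hypothetical field
            yield item
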
import csv
import time
import json

import scrapy

from amazon.utils import genlog
from amazon.utils.util import escape
from amazon.item.findnsave import FindnsaveSaleItem
from amazon.utils.util import first_item, safe, \
        xpath, f_xpath, first_item_xpath, \
        xpath_extract, fx_extract, first_item_xpath_extract

logger = genlog.createlogger( 'findnsave_sales' )
genlog.logger = logger


class FindnsaveSalesSpider(scrapy.Spider):

    logger = logger

    name = 'findnsavesales'
    allowed_domains = ( "findnsave.com", )

    location = 'newyork'
    rooturl = "http://%s.findnsave.com" % location
    start_urls = [ rooturl + "/store/Walmart/10175/" ]
    #start_urls = [ rooturl + "/store/Target/10002/" ]
    #start_urls = [ rooturl + "/store/ToysRUs/10011/" ]

    #csv_fd = open( '/tmp/newyork_sales.csv', 'w' )
    #writer = csv.writer( csv_fd, delimiter = '\\' )
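    # The original file is truncated at this point. A minimal hedged sketch of
    # a parse() callback follows, assuming each sale is rendered as a tile on
    # the store page; the XPath and the FindnsaveSaleItem field names are
    # assumptions, not the author's original code.
    def parse(self, response):
        logger.info( 'fetch : ' + response.url )

        for tile in xpath( response, '//div[contains(@class, "deal")]' ):
            item = FindnsaveSaleItem()
            item[ 'name' ] = fx_extract( tile, './/a/text()' )  # hypothetical field
            item[ 'price' ] = fx_extract( tile, './/span[contains(@class, "price")]/text()' )  # hypothetical field
            yield item
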
import csv
import time
import json

import scrapy

from amazon.utils import genlog
from amazon.item.findnsave import FindnsaveCategoryItem
from amazon.utils.rediscli import get_cli, RedisLock, RedisLockError
from amazon.utils.util import first_item, safe, \
        xpath, f_xpath, first_item_xpath, \
        xpath_extract, fx_extract, first_item_xpath_extract

logger = genlog.createlogger( 'findnsave_categories' )
genlog.logger = logger


class FindnsaveCategoriesSpider(scrapy.Spider):

    logger = logger

    name = 'findnsavecategories'
    allowed_domains = ( "findnsave.com", )

    location = 'newyork'
    rooturl = "http://%s.findnsave.com" % location
    start_urls = [ rooturl + "/categories/" ]

    #csv_fd = open( '/tmp/categories.csv', 'w' )
    #csv.writer( csv_fd ).writerow( [ 'id', 'cid', 'name', 'href' ] )

    def parse(self, response):
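        # The method body is truncated in the source. A minimal hedged sketch
        # follows; the XPath and the FindnsaveCategoryItem field names are
        # assumptions, not the author's original code.
        logger.info( 'fetch : ' + response.url )

        for a in xpath( response, '//ul[contains(@class, "categories")]//a' ):
            item = FindnsaveCategoryItem()
            item[ 'name' ] = fx_extract( a, './text()' )  # hypothetical field
            item[ 'uri' ] = fx_extract( a, './@href' )    # hypothetical field
            yield item
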
import json

import scrapy

from amazon.utils import genlog
from amazon.utils.s3clientutil import authedclient, put_file_from_url
from amazon.utils.util import first_item, safe, \
        xpath, f_xpath, first_item_xpath, \
        xpath_extract, fx_extract, first_item_xpath_extract

logger = genlog.createlogger( 'earthpics' )


class EarthPicsSpider(scrapy.Spider):

    name = 'earthpics'
    allowed_domains = ( "earthpics.me", )
    start_urls = [ "http://earthpics.me/" ]

    prefix_len = len( 'http://earthpics.me/' )

    @safe
    def parse_one_top(self, response):
        logger.info( 'fetch : ' + response.url )

        img = f_xpath( response, '//div[contains(@class, "inner-main-content")]' )

        meta = {}
        meta[ 'name' ] = fx_extract( img, './div/h3/text()' ).strip().strip( '#' )
        meta[ 'img' ] = fx_extract( img, './/div[@class="inner-image"]/img/@src' )
        meta[ 'key' ] = meta[ 'img' ][ self.prefix_len: ]
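        # Truncated in the source from here on. A hedged sketch of a plausible
        # continuation: log the collected metadata and copy the image into S3.
        # The exact put_file_from_url signature is an assumption, so the call
        # is left commented out.
        logger.info( 'pic meta : ' + json.dumps( meta ) )
        # put_file_from_url( authedclient(), meta[ 'key' ], meta[ 'img' ] )
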
import csv
import time
import json

import scrapy

from amazon.utils import genlog
from amazon.item.findnsave import FindnsaveStoreItem
from amazon.utils.rediscli import get_cli, RedisLock, RedisLockError
from amazon.utils.util import first_item, safe, \
        xpath, f_xpath, first_item_xpath, \
        xpath_extract, fx_extract, first_item_xpath_extract

logger = genlog.createlogger( 'findnsave_stores' )
genlog.logger = logger


class FindnsaveStoresSpider(scrapy.Spider):

    logger = logger

    name = 'findnsavestores'
    allowed_domains = ( "findnsave.com", )

    location = 'newyork'
    rooturl = "http://%s.findnsave.com" % location
    start_urls = [ rooturl + "/stores/?sort=top" ]

    #csv_fd = open( '/tmp/stores.csv', 'w' )
    #csv.writer( csv_fd ).writerow( [ 'id', 'sid', 'name', 'href' ] )

    def parse(self, response):
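        # The method body is truncated in the source. A minimal hedged sketch
        # follows; the XPath and the FindnsaveStoreItem field names are
        # assumptions, not the author's original code.
        logger.info( 'fetch : ' + response.url )

        for a in xpath( response, '//ul[contains(@class, "stores")]//a' ):
            item = FindnsaveStoreItem()
            item[ 'name' ] = fx_extract( a, './text()' )  # hypothetical field
            item[ 'uri' ] = fx_extract( a, './@href' )    # hypothetical field
            yield item
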
import csv
import time
import json

import scrapy

from amazon.utils import genlog
from amazon.item.findnsave import FindnsaveAreaItem
from amazon.utils.util import first_item, safe, \
        xpath, f_xpath, first_item_xpath, \
        xpath_extract, fx_extract, first_item_xpath_extract

logger = genlog.createlogger( 'findnsave_location' )
genlog.logger = logger


class FindnsaveLocationSpider(scrapy.Spider):

    logger = logger

    name = 'findnsavelocation'
    allowed_domains = ( "findnsave.com", )

    rooturl = "http://findnsave.com"
    start_urls = [ rooturl + "/?markets=1" ]

    def parse(self, response):
        logger.info( 'fetch : ' + response.url )

        states = f_xpath( response, '//select[@id="states-dropdown"]' ).xpath( './option' )

        sts = {}
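        # Truncated in the source from here on. A hedged sketch of the likely
        # continuation: map each state option to its display name and yield an
        # area item per state. The '@value' attribute and the
        # FindnsaveAreaItem field names are assumptions.
        for st in states:
            abbr = fx_extract( st, './@value' )
            name = fx_extract( st, './text()' )
            if not abbr:
                continue
            sts[ abbr ] = name

            item = FindnsaveAreaItem()
            item[ 'state' ] = abbr  # hypothetical field
            item[ 'name' ] = name   # hypothetical field
            yield item
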