import datetime

from scrapy import Request, Selector
from scrapy.exceptions import CloseSpider
from scrapy.spiders import CrawlSpider

import config
import utils
from sqlhelper import SqlHelper


class RecipeDetail(CrawlSpider):
    name = "recipe_detail"
    base_url = 'https://www.xiachufang.com'

    header = {
        'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
        'Accept-Encoding': 'gzip, deflate, sdch',
        'Accept-Language': 'zh-CN,zh;q=0.8,en;q=0.6',
        'Connection': 'keep-alive',
        'Host': 'www.xiachufang.com',
        'Upgrade-Insecure-Requests': '1',
        'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10.11; rv:51.0) Gecko/20100101 Firefox/51.0',
    }

    def __init__(self, *a, **kw):
        super(RecipeDetail, self).__init__(*a, **kw)

        self.dir_name = 'log/%s' % self.name
        self.sql = SqlHelper()
        self.init()
        utils.make_dir(self.dir_name)

    def init(self):
        command = (
            "CREATE TABLE IF NOT EXISTS {} ("
            "`id` INT(8) NOT NULL AUTO_INCREMENT,"
            "`name` CHAR(20) NOT NULL COMMENT 'recipe name',"
            "`recipe_id` INT(12) NOT NULL COMMENT 'recipe ID',"
            "`source_name` CHAR(20) NOT NULL COMMENT 'source name',"
            "`source_id` INT(8) NOT NULL COMMENT 'source ID',"
            "`create_time` DATETIME NOT NULL,"
            "PRIMARY KEY(id)"
            ") ENGINE=InnoDB".format(config.item_detail_table)
        )
        self.sql.create_table(command)

    def start_requests(self):
        command = "SELECT * from {}".format(config.item_list_table)
        data = self.sql.query(command)

        for i, recipe in enumerate(data):
            if recipe[0] > 8999 and recipe[0] < 10000:
                url = self.base_url + recipe[2]
                utils.log(url)

                yield Request(
                    url=url,
                    headers=self.header,
                    callback=self.parse_all,
                    errback=self.error_parse,
                    meta={"re_id": recipe[3], "re_name": recipe[1]},
                    dont_filter=True,
                )

    def parse_all(self, response):
        utils.log(response.url)

        if response.status == 429:
            raise CloseSpider('Too many requests, IP banned')

        if response.status == 200:
            file_name = '%s/recipe.html' % self.dir_name
            self.save_page(file_name, response.body)

            # each row of the ingredients table links to one ingredient page
            sources = response.xpath("//div[@class='ings']//tr").extract()
            for source in sources:
                sel = Selector(text=source)
                source_name = sel.xpath("//a/text()").extract_first()
                url = sel.xpath("//a/@href").extract_first()

                if source_name is not None and url is not None:
                    source_id = url.split('/')[-2]
                    r_name = response.meta["re_name"]
                    r_id = response.meta["re_id"]
                    dt = datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S")

                    msg = (None, r_name, r_id, source_name, source_id, dt)
                    command = ("INSERT IGNORE INTO {} "
                               "(id, name, recipe_id, source_name, source_id, create_time) "
                               "VALUES(%s,%s,%s,%s,%s,%s)".format(config.item_detail_table))
                    self.sql.insert_data(command, msg)

    def error_parse(self, failure):
        request = failure.request
        utils.log('error_parse url:%s meta:%s' % (request.url, request.meta))

    def save_page(self, file_name, data):
        with open(file_name, 'w') as f:
            f.write(data)
import re

from scrapy import Request, Selector
from scrapy.spiders import Spider

import config
import utils
from sqlhelper import SqlHelper


class GameUrls(Spider):
    name = 'game_urls'
    start_urls = [
        'http://store.steampowered.com/search/?sort_by=Released_DESC&page=%s' % n
        for n in range(1, 1058)
    ]

    def __init__(self, *a, **kw):
        super(GameUrls, self).__init__(*a, **kw)

        self.dir_game = 'log/%s' % self.name
        self.sql = SqlHelper()
        self.init()
        utils.make_dir(self.dir_game)

    def init(self):
        command = ("CREATE TABLE IF NOT EXISTS {} ("
                   "`id` INT(8) NOT NULL AUTO_INCREMENT,"
                   "`type` CHAR(10) NOT NULL,"
                   "`name` TEXT NOT NULL,"
                   "`url` TEXT NOT NULL,"
                   "`is_crawled` CHAR(5) DEFAULT 'no',"
                   "`page` INT(5) NOT NULL,"
                   "PRIMARY KEY(id)"
                   ") ENGINE=InnoDB".format(config.steam_game_urls_table))
        self.sql.create_table(command)

    def start_requests(self):
        for i, url in enumerate(self.start_urls):
            yield Request(
                url=url,
                headers={
                    'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
                    'Accept-Encoding': 'gzip, deflate',
                    'Accept-Language': 'en-US,en;q=0.5',
                    'Connection': 'keep-alive',
                    'Host': 'store.steampowered.com',
                    'Upgrade-Insecure-Requests': '1',
                    'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10.11; rv:51.0) Gecko/20100101 '
                                  'Firefox/51.0',
                },
                meta={
                    'url': url,
                    'page': i + 1,
                },
                dont_filter=True,
                callback=self.parse_all,
                errback=self.error_parse,
            )

    def parse_all(self, response):
        # file_name = '%s/%s.html' % (self.dir_game, response.meta.get('page'))
        # self.save_page(file_name, response.body)
        self.log('parse_all url:%s' % response.url)

        game_list = response.xpath('//div[@id="search_result_container"]/div[2]/a').extract()
        for game in game_list:
            sel = Selector(text=game)
            url = sel.xpath('//@href').extract_first()
            id, type = self.get_id(url)
            # id = sel.xpath('//@data-ds-appid').extract_first()
            name = sel.xpath('//div[@class="col search_name ellipsis"]/span/text()').extract_first()

            msg = (None, type, name, url, 'no', response.meta.get('page'))
            command = ("INSERT IGNORE INTO {} "
                       "(id, type, name, url, is_crawled, page) "
                       "VALUES(%s, %s, %s, %s, %s, %s)".format(config.steam_game_urls_table))
            self.sql.insert_data(command, msg)

    def error_parse(self, failure):
        request = failure.request
        utils.log('error_parse url:%s meta:%s' % (request.url, request.meta))

    def get_id(self, url):
        # classify the store URL and pull out its numeric id
        if '/sub/' in url:
            pattern = re.compile('/sub/(\d+)/')
            type = 'sub'
        elif '/app/' in url:
            pattern = re.compile('/app/(\d+)/', re.S)
            type = 'app'
        elif '/bundle/' in url:
            pattern = re.compile('/bundle/(\d+)/', re.S)
            type = 'bundle'
        else:
            pattern = re.compile('/(\d+)/', re.S)
            type = 'other'
            utils.log('get_id other url:%s' % url)

        id = re.search(pattern, url)
        if id:
            return id.group(1), type

        utils.log('get_id error url:%s' % url)
        return 0, 'error'

    def save_page(self, file_name, data):
        with open(file_name, 'w') as f:
            f.write(data)
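
# A quick, self-contained check of the URL classifier above. The store URLs are
# made up purely to exercise each branch of get_id; GameInfo's database setup is
# skipped by creating the instance with __new__, since get_id does not use any
# instance state for these cases.
if __name__ == '__main__':
    probe = GameUrls.__new__(GameUrls)
    for u in ('http://store.steampowered.com/app/730/',
              'http://store.steampowered.com/sub/391/',
              'http://store.steampowered.com/bundle/232/'):
        print(probe.get_id(u))
    # expected: ('730', 'app'), ('391', 'sub'), ('232', 'bundle')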
import datetime
import time

from scrapy import Request
from scrapy.spiders import Spider

import config
import utils
from sqlhelper import SqlHelper


class Validator(Spider):
    name = 'base'

    concurrent_requests = 16
    retry_enabled = False

    def __init__(self, name=None, **kwargs):
        super(Validator, self).__init__(name, **kwargs)

        self.sql = SqlHelper()
        self.dir_log = 'log/validator/%s' % self.name
        self.timeout = 10
        self.urls = []
        self.headers = None
        self.success_mark = ''
        self.is_record_web_page = False

    def init(self):
        utils.make_dir(self.dir_log)

        command = utils.get_create_table_command(self.name)
        self.sql.create_table(command)

    @classmethod
    def update_settings(cls, settings):
        settings.setdict(cls.custom_settings or {
            'CONCURRENT_REQUESTS': cls.concurrent_requests,
            'RETRY_ENABLED': cls.retry_enabled,
        }, priority='spider')

    def start_requests(self):
        count = utils.get_table_length(self.sql, self.name)
        count_free = utils.get_table_length(self.sql, config.httpbin_table)

        ids = utils.get_table_ids(self.sql, self.name)
        ids_free = utils.get_table_ids(self.sql, config.httpbin_table)

        for i in range(0, count + count_free):
            table = self.name if (i < count) else config.httpbin_table
            id = ids[i] if i < count else ids_free[i - len(ids)]

            proxy = utils.get_proxy_info(self.sql, table, id)
            if proxy is None:
                continue

            for url in self.urls:
                cur_time = time.time()
                yield Request(
                    url=url,
                    headers=self.headers,
                    meta={
                        'cur_time': cur_time,
                        'download_timeout': self.timeout,
                        'proxy_info': proxy,
                        'table': table,
                        'id': proxy.get('id'),
                        'proxy': 'http://%s:%s' % (proxy.get('ip'), proxy.get('port')),
                        'vali_count': proxy.get('vali_count', 0),
                    },
                    dont_filter=True,
                    callback=self.success_parse,
                    errback=self.error_parse,
                )

    def success_parse(self, response):
        utils.log('success_parse speed:%s meta:%s' %
                  (time.time() - response.meta.get('cur_time'), response.meta))

        proxy = response.meta.get('proxy_info')
        table = response.meta.get('table')
        id = response.meta.get('id')
        ip = proxy.get('ip')

        self.save_page(ip, response.body)

        if self.success_mark in response.body or self.success_mark == '':
            speed = time.time() - response.meta.get('cur_time')
            if table == self.name:
                if speed > self.timeout:
                    command = utils.get_delete_data_command(table, id)
                    self.sql.execute(command)
                else:
                    vali_count = response.meta.get('vali_count', 0) + 1
                    command = utils.get_update_data_command(table, id, speed, vali_count)
                    self.sql.execute(command)
            else:
                if speed < self.timeout:
                    command = utils.get_insert_data_command(self.name)
                    msg = (None, proxy.get('ip'), proxy.get('port'), proxy.get('country'),
                           proxy.get('anonymity'), proxy.get('https'), speed,
                           proxy.get('source'), None, 1)
                    self.sql.insert_data(command, msg, commit=True)
        else:
            # the success mark was not found, so the response is bogus;
            # delete this ip from the current table
            if table == self.name:
                command = utils.get_delete_data_command(table, id)
                self.sql.execute(command)

    def error_parse(self, failure):
        request = failure.request
        utils.log('error_parse value:%s url:%s meta:%s' %
                  (failure.value, request.url, request.meta))

        proxy = failure.request.meta.get('proxy_info')
        table = failure.request.meta.get('table')
        id = failure.request.meta.get('id')

        if table == self.name:
            command = utils.get_delete_data_command(table, id)
            self.sql.execute(command)
        else:
            # TODO: when validation through a proxy fails, handle it per error type
            pass

            # request = failure.request.meta
            # utils.log('request meta:%s' % str(request))
            #
            # # log all errback failures,
            # # in case you want to do something special for some errors,
            # # you may need the failure's type
            # self.logger.error(repr(failure))
            #
            # #if isinstance(failure.value, HttpError):
            # if failure.check(HttpError):
            #     # you can get the response
            #     response = failure.value.response
            #     self.logger.error('HttpError on %s', response.url)
            #
            # #elif isinstance(failure.value, DNSLookupError):
            # elif failure.check(DNSLookupError):
            #     # this is the original request
            #     request = failure.request
            #     self.logger.error('DNSLookupError on %s', request.url)
            #
            # #elif isinstance(failure.value, TimeoutError):
            # elif failure.check(TimeoutError):
            #     request = failure.request
            #     self.logger.error('TimeoutError on url:%s', request.url)

    def save_page(self, ip, data):
        filename = '{time} {ip}'.format(
            time=datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S:%f'),
            ip=ip)
        utils.log('filename:%s' % filename)

        if self.is_record_web_page:
            with open('%s/%s.html' % (self.dir_log, filename), 'w') as f:
                f.write(data)

    def close(spider, reason):
        spider.sql.commit()
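
# A minimal sketch of how a concrete validator could subclass the base class
# above. The spider name, target URL, headers and success_mark below are
# assumptions made up for illustration, not taken from the original project;
# httpbin.org/get is only used because its JSON response reliably contains "url".
class ExampleValidator(Validator):
    name = 'example'  # hypothetical spider/table name

    def __init__(self, name=None, **kwargs):
        super(ExampleValidator, self).__init__(name, **kwargs)

        self.timeout = 5
        self.urls = ['http://httpbin.org/get']        # page fetched through each proxy
        self.headers = {'User-Agent': 'Mozilla/5.0'}
        self.success_mark = '"url"'                    # substring expected in a good response
        self.init()                                    # create this validator's result table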
from scrapy import Selector

from sqlhelper import SqlHelper

sql = SqlHelper()

# with open('%s.html' % city, 'w') as f:
#     f.write(driver.page_source)
#
# sel = Selector(text = driver.page_source)
# cs = sel.xpath('//div[@class="data-list"]/ul/li/a').extract()
# for c in cs:
#     s = Selector(text = c)
#     msg = (s.xpath('//@data-code').extract_first(), s.xpath('//text()').extract_first())
#     command = ("INSERT IGNORE INTO {} (id, name) VALUES(%s, %s)".format('liepin_city_id'))
#
#     sql.insert_data(command, msg)
#
# driver.find_element_by_xpath('//li[@data-selector="tab-all"]').click()
# time.sleep(4)

with open('liepin_2.html', 'r') as f:
    text = f.read()

sel = Selector(text=text)
cs = sel.xpath('//li/a[@class="d-item"]').extract()
for c in cs:
    s = Selector(text=c)
    msg = (s.xpath('//@data-code').extract_first(),
           s.xpath('//text()').extract_first())
    command = ("INSERT IGNORE INTO {} (id, name) VALUES(%s, %s)".format('liepin_city_id'))

    sql.insert_data(command, msg)
# encoding=utf-8
from sqlhelper import SqlHelper

sql = SqlHelper()


def insert_data_to_users():
    command = ("INSERT IGNORE INTO users "
               "(id, name, remark) "
               "VALUES(%s, %s, %s)")
    return command


command = insert_data_to_users()
msg = ("112", "11", "")

sql.insert_data(command, msg, commit=True)
print('created user success')
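
# The scripts in this collection all lean on a small SqlHelper wrapper
# (create_table, query, insert_data, execute, commit) that is not included here.
# What follows is only a guess at a minimal pymysql-backed sqlhelper.py that
# would satisfy those call sites; the connection parameters are placeholders,
# and the real project's implementation may differ.
import pymysql


class SqlHelper(object):
    def __init__(self):
        # placeholder credentials; the real values would live in the project's config
        self.conn = pymysql.connect(host='localhost', user='root', password='',
                                    db='crawler', charset='utf8mb4')
        self.cursor = self.conn.cursor()

    def create_table(self, command):
        self.cursor.execute(command)
        self.conn.commit()

    def query(self, command):
        self.cursor.execute(command)
        return self.cursor.fetchall()

    def insert_data(self, command, data, commit=False):
        # command is a parameterized INSERT; data supplies the %s placeholders
        self.cursor.execute(command, data)
        if commit:
            self.conn.commit()

    def execute(self, command, commit=True):
        self.cursor.execute(command)
        if commit:
            self.conn.commit()

    def commit(self):
        self.conn.commit()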
import datetime

from scrapy import Request, Selector
from scrapy.spiders import Spider

import config
import utils
from sqlhelper import SqlHelper


class User(Spider):
    name = "user_urls"
    start_url = 'https://www.xiachufang.com'

    header = {
        'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
        'Accept-Encoding': 'gzip, deflate, sdch',
        'Accept-Language': 'zh-CN,zh;q=0.8,en;q=0.6',
        'Connection': 'keep-alive',
        'Host': 'www.xiachufang.com',
        'Upgrade-Insecure-Requests': '1',
        'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10.11; rv:51.0) Gecko/20100101 Firefox/51.0',
    }

    def __init__(self, *a, **kw):
        super(User, self).__init__(*a, **kw)

        self.dir_name = 'log/%s' % self.name
        self.sql = SqlHelper()
        self.init()
        utils.make_dir(self.dir_name)

    def init(self):
        command = ("CREATE TABLE IF NOT EXISTS {} ("
                   "`id` INT(8) NOT NULL AUTO_INCREMENT,"
                   "`name` CHAR(20) NOT NULL COMMENT 'user_name',"
                   "`user_id` INT(12) NOT NULL COMMENT 'user_ID',"
                   "`url` TEXT NOT NULL COMMENT 'user_url',"
                   "`create_time` DATETIME NOT NULL,"
                   "PRIMARY KEY(id),"
                   "UNIQUE KEY `user_id` (`user_id`)"
                   ") ENGINE=InnoDB".format(config.users_urls_table))
        self.sql.create_table(command)

    def start_requests(self):
        active_url = '/feature/cook/active/'
        url = self.start_url + active_url

        # NOTE: index is never used in the loop body, so the same page is
        # requested N-1 times (duplicates are dropped by the unique user_id key)
        N = 5
        for index in range(1, N):
            yield Request(
                url=url,
                headers=self.header,
                callback=self.parse_all,
                errback=self.error_parse,
                dont_filter=True,
            )

    def parse_all(self, response):
        if response.status == 200:
            file_name = '%s/users.html' % self.dir_name
            self.save_page(file_name, response.body)

            users = response.xpath("//div[@class='content']/ul/li").extract()
            for user in users:
                sel = Selector(text=user)
                url = sel.xpath("//div[@class='name']/a/@href").extract_first()
                user_id = url.split('/')[-2]
                name = sel.xpath("//div[@class='name']/a/text()").extract_first()
                dt = datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S")

                msg = (None, name, user_id, url, dt)
                command = ("INSERT IGNORE INTO {} "
                           "(id, name, user_id, url, create_time) "
                           "VALUES(%s,%s,%s,%s,%s)".format(config.users_urls_table))
                self.sql.insert_data(command, msg)

    def error_parse(self, failure):
        request = failure.request
        utils.log('error_parse url:%s meta:%s' % (request.url, request.meta))

    def save_page(self, file_name, data):
        with open(file_name, 'w') as f:
            f.write(data)
import datetime
import re

from scrapy import Request, Selector
from scrapy.spiders import CrawlSpider

import config
import utils
from sqlhelper import SqlHelper


class ItemDetail(CrawlSpider):
    name = "item_list"
    base_url = "http://www.xiachufang.com"

    header = {
        'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
        'Accept-Encoding': 'gzip, deflate, sdch',
        'Accept-Language': 'zh-CN,zh;q=0.8,en;q=0.6',
        'Connection': 'keep-alive',
        'Host': 'www.xiachufang.com',
        'Upgrade-Insecure-Requests': '1',
        'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10.11; rv:51.0) Gecko/20100101 Firefox/51.0',
    }

    def __init__(self, *a, **kw):
        super(ItemDetail, self).__init__(*a, **kw)

        self.dir_name = 'log/%s' % self.name
        self.sql = SqlHelper()
        self.init()
        utils.make_dir(self.dir_name)

    def init(self):
        command = ("CREATE TABLE IF NOT EXISTS {} ("
                   "`id` INT(8) NOT NULL AUTO_INCREMENT,"
                   "`name` CHAR(20) NOT NULL COMMENT 'dish name',"
                   "`url` TEXT NOT NULL COMMENT 'dish url',"
                   "`img` TEXT NOT NULL COMMENT 'cover image',"
                   "`item_id` INT(8) NOT NULL COMMENT 'dish id',"
                   "`source` TEXT NOT NULL COMMENT 'ingredients',"
                   "`score` CHAR(5) NOT NULL COMMENT 'rating',"
                   "`create_time` DATETIME NOT NULL,"
                   "PRIMARY KEY(id),"
                   "UNIQUE KEY `item_id` (`item_id`)"
                   ") ENGINE=InnoDB".format(config.item_list_table))
        self.sql.create_table(command)

    def start_requests(self):
        command = "SELECT * from {}".format(config.category_urls_table)
        data = self.sql.query(command)

        for i, category in enumerate(data):
            url = self.base_url + category[2]
            utils.log(url)

            yield Request(
                url=url,
                headers=self.header,
                callback=self.parse_all,
                errback=self.error_parse,
            )

    def parse_all(self, response):
        utils.log(response.url)

        if response.status == 200:
            file_name = '%s/category.html' % self.dir_name
            self.save_page(file_name, response.body)

            recipes = response.xpath("//div[@class='normal-recipe-list']/ul/li").extract()
            self.parse_recipes(recipes)

            # follow the "next page" link until the category is exhausted
            nextPage = response.xpath("//div[@class='pager']/a[@class='next']/@href").extract_first()
            if nextPage:
                yield Request(
                    url=self.base_url + nextPage,
                    headers=self.header,
                    callback=self.parse_all,
                    errback=self.error_parse,
                )

    def parse_recipes(self, recipes):
        for recipe in recipes:
            sel = Selector(text=recipe)
            name = sel.xpath("//p[@class='name']/text()").extract_first().strip()
            url = sel.xpath("//a[1]/@href").extract_first()
            img = sel.xpath("//div[@class='cover pure-u']/img/@data-src").extract_first()
            item_id = re.compile("/recipe/(.*?)/").findall(url)[0]
            source = sel.xpath("//p[@class='ing ellipsis']/text()").extract_first().strip()
            score = sel.xpath("//p[@class='stats']/span/text()").extract_first().strip()
            dt = datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S")

            msg = (None, name, url, img, item_id, source, score, dt)
            command = ("INSERT IGNORE INTO {} "
                       "(id, name, url, img, item_id, source, score, create_time) "
                       "VALUES(%s,%s,%s,%s,%s,%s,%s,%s)".format(config.item_list_table))
            self.sql.insert_data(command, msg)

    def error_parse(self, failure):
        request = failure.request
        utils.log('error_parse url:%s meta:%s' % (request.url, request.meta))

    def save_page(self, file_name, data):
        with open(file_name, 'w') as f:
            f.write(data)
# -*- coding: utf-8 -*-
import re
import urllib.request

from bs4 import BeautifulSoup as bs

import utils
from sqlhelper import SqlHelper


class Crawler(object):
    def __init__(self):
        super(Crawler, self).__init__()

        self.album_prefix = 'https://mm.taobao.com/self/album/open_album_list.htm?_charset=utf-8&user_id%20={0}&page={1}'
        self.image_prefix = 'https://mm.taobao.com/album/json/get_album_photo_list.htm?user_id={0}&album_id={1}&page={2}'

        self.image_pattern = re.compile('''img.*290x10000.jpg''', re.U)
        self.image_name_pattern = re.compile('''"picId":"(.*?)"''', re.U)
        self.model_pattern = re.compile('''<a class="lady-name" href="(.*?)".*>(.*?)</a>''', re.U)
        self.album_pattern = re.compile('''.*album_id=(.*?)&.*''', re.U)

        self.links = []
        self.ids = []
        self.names = []
        self.sql = SqlHelper()

    def readHtml(self, html):
        response = urllib.request.urlopen(html)
        return response.read()

    def getLinkIdAndNames(self, htmlData):
        # the model list page is GBK encoded; decode the bytes before matching
        if isinstance(htmlData, bytes):
            htmlData = htmlData.decode('gbk', 'ignore')

        items = re.findall(self.model_pattern, htmlData)
        print('items:', items)

        self.links = [link for link, name in items]
        self.names = [name for link, name in items]
        self.ids = [link[link.index('=') + 1:] for link in self.links]

    def getAlbums(self):
        for i, model_id in enumerate(self.ids):
            utils.log('start downloading:%s' % self.names[i])
            print('start downloading', self.names[i])

            # insert the user
            command = self.sql.insert_data_to_users()
            msg = (model_id, self.names[i], "")
            try:
                self.sql.insert_data(command, msg, commit=True)
            except Exception as e:
                utils.log('insert users data errors')

            for page in range(1, 10):
                utils.log('current page:%s' % page)
                # print('current page', page)

                model_url = self.album_prefix.format(model_id, page)
                soup = bs(self.readHtml(model_url), 'html.parser')
                albums = soup.find_all('div', class_='mm-photo-cell-middle')
                if not albums:
                    break

                for album in albums:
                    album_name = album.find('h4').a.string.strip().rstrip('.')
                    album_link = album.find('h4').a['href']
                    album_id = re.findall(self.album_pattern, album_link)[0]
                    # strip both the decoded and the mojibake form of '创建时间:' (creation time)
                    album_create_time = album.find('p', class_='mm-photo-date').string.strip(
                        u'创建时间: ').strip(u'´´½¨Ê±¼ä:')
                    # strip '()' and both forms of the count unit '张'
                    album_img_count = album.find('span', class_='mm-pic-number').string.strip(
                        '()').strip(u'张').strip(u'ÕÅ')

                    print(">>>>>>>>>>>>>>>>>>>>>>")
                    print(album.find('p', class_='mm-photo-date').string)
                    print(album_create_time)
                    print(">>>>>>>>>>>>>>>>>>>>>>")

                    # insert the album
                    command = self.sql.insert_data_to_albums()
                    msg = (album_id, model_id, album_name, album_create_time, "", 1, album_img_count)
                    try:
                        self.sql.insert_data(command, msg, commit=True)
                    except Exception as e:
                        utils.log('insert albums data errors')

                    utils.log('start in album:%s, total size: %s' % (album_name, album_img_count))
                    self.getImages(model_id, album_id, album_img_count)

    def getImages(self, model_id, album_id, image_count):
        # the album JSON API returns 16 photos per page
        for page in range(1, (int(image_count) - 1) // 16 + 2):
            link = self.image_prefix.format(model_id, album_id, page)
            body = self.readHtml(link).decode('gbk')

            images = re.findall(self.image_pattern, body)
            # tried to use descriptions as names, but they duplicate, so pic ids are used instead
            names = re.findall(self.image_name_pattern, body)

            for idx, image in enumerate(images):
                image = image.replace('290', '620')
                try:
                    img_url = ('http://' + image).replace('jpg_620x10000.jpg', 'jpg')
                except Exception as e:
                    img_url = ('http://' + image)

                # columns: id, album_id, name, url, kind
                # insert the photo
                command = self.sql.insert_data_to_photos()
                msg = (None, album_id, "", img_url, 1)
                try:
                    self.sql.insert_data(command, msg, commit=True)
                except Exception as e:
                    utils.log('insert photos data errors')

        print('created photos success')
import datetime
import re

from scrapy import Request, Selector
from scrapy.spiders import Spider

import config
import utils
from sqlhelper import SqlHelper


class Category(Spider):
    name = "category_urls"
    start_url = 'http://www.xiachufang.com/category/'

    def __init__(self, *a, **kw):
        super(Category, self).__init__(*a, **kw)

        self.dir_name = 'log/%s' % self.name
        self.sql = SqlHelper()
        self.init()
        utils.make_dir(self.dir_name)

    def init(self):
        command = ("CREATE TABLE IF NOT EXISTS {} ("
                   "`id` INT(8) NOT NULL AUTO_INCREMENT,"
                   "`name` CHAR(20) NOT NULL COMMENT 'category name',"
                   "`url` TEXT NOT NULL COMMENT 'category url',"
                   "`category` CHAR(20) NOT NULL COMMENT 'parent category',"
                   "`category_id` INT(8) NOT NULL COMMENT 'category id',"
                   "`create_time` DATETIME NOT NULL,"
                   "PRIMARY KEY(id),"
                   "UNIQUE KEY `category_id` (`category_id`)"
                   ") ENGINE=InnoDB".format(config.category_urls_table))
        self.sql.create_table(command)

    def start_requests(self):
        yield Request(
            url=self.start_url,
            headers={
                'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
                'Accept-Encoding': 'gzip, deflate, sdch',
                'Accept-Language': 'zh-CN,zh;q=0.8,en;q=0.6',
                'Connection': 'keep-alive',
                'Host': 'www.xiachufang.com',
                'Upgrade-Insecure-Requests': '1',
                'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10.11; rv:51.0) Gecko/20100101 Firefox/51.0',
            },
            callback=self.parse_all,
            errback=self.error_parse,
        )

    def parse_all(self, response):
        if response.status == 200:
            file_name = '%s/category.html' % self.dir_name
            self.save_page(file_name, response.body)

            categorys = response.xpath("//div[@class='cates-list-all clearfix hidden']").extract()
            for category in categorys:
                sel_category = Selector(text=category)
                category_father = sel_category.xpath("//h4/text()").extract_first().strip()

                items = sel_category.xpath("//ul/li/a").extract()
                for item in items:
                    sel = Selector(text=item)
                    url = sel.xpath("//@href").extract_first()
                    name = sel.xpath("//text()").extract_first()
                    _id = re.compile('/category/(.*?)/').findall(url)[0]
                    dt = datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S")

                    msg = (None, name, url, category_father, _id, dt)
                    command = ("INSERT IGNORE INTO {} "
                               "(id, name, url, category, category_id, create_time) "
                               "VALUES(%s,%s,%s,%s,%s,%s)".format(config.category_urls_table))
                    self.sql.insert_data(command, msg)

    def error_parse(self, failure):
        request = failure.request
        utils.log('error_parse url:%s meta:%s' % (request.url, request.meta))

    def save_page(self, file_name, data):
        with open(file_name, 'w') as f:
            f.write(data)
import datetime
import time

from scrapy import Request
from scrapy.spiders import Spider
from scrapy.utils.project import get_project_settings

import utils
from config import free_ipproxy_table
from sqlhelper import SqlHelper


class Validator(Spider):
    name = 'base'

    def __init__(self, name=None, **kwargs):
        super(Validator, self).__init__(name, **kwargs)

        self.sql = SqlHelper()
        self.dir_log = 'log/validator/%s' % self.name
        self.timeout = 10
        self.urls = []
        self.headers = None
        self.success_mark = ''

    def init(self):
        utils.make_dir(self.dir_log)

        command = utils.get_create_table_command(self.name)
        self.sql.create_table(command)

    def start_requests(self):
        count = utils.get_table_length(self.sql, self.name)
        count_free = utils.get_table_length(self.sql, free_ipproxy_table)

        for i in range(0, count + count_free):
            table = self.name if (i < count) else free_ipproxy_table

            proxy = utils.get_proxy_info(self.sql, table, i)
            if proxy is None:
                continue

            for url in self.urls:
                cur_time = time.time()
                yield Request(
                    url=url,
                    headers=self.headers,
                    meta={
                        'cur_time': cur_time,
                        'download_timeout': self.timeout,
                        'proxy_info': proxy,
                        'table': table,
                        'id': proxy.get('id'),
                        'proxy': 'http://%s:%s' % (proxy.get('ip'), proxy.get('port')),
                    },
                    dont_filter=True,
                    callback=self.success_parse,
                    errback=self.error_parse,
                )

    def success_parse(self, response):
        utils.log('name:%s success_parse proxy:%s meta:%s' %
                  (self.name, str(response.meta.get('proxy_info')), str(response.meta)))

        filename = datetime.datetime.now().strftime('%Y-%m-%d %H_%M_%S_%f')
        self.save_page(filename, response.body)

        # find() returns -1 when the mark is missing, so compare explicitly
        if response.body.find(self.success_mark) != -1 or self.success_mark == '':
            proxy = response.meta.get('proxy_info')
            speed = time.time() - response.meta.get('cur_time')
            table = response.meta.get('table')
            id = response.meta.get('id')

            utils.log('speed:%s table:%s id:%s' % (speed, table, id))

            if table == self.name:
                if speed > self.timeout:
                    command = utils.get_delete_data_command(table, id)
                    self.sql.execute(command)
                else:
                    command = utils.get_update_data_command(table, id, speed)
                    self.sql.execute(command)
            else:
                if speed < self.timeout:
                    command = utils.get_insert_data_command(self.name)
                    msg = (None, proxy.get('ip'), proxy.get('port'), proxy.get('country'),
                           proxy.get('anonymity'), proxy.get('https'), speed,
                           proxy.get('source'), None)
                    self.sql.insert_data(command, msg)

    def error_parse(self, failure):
        utils.log('error_parse value:%s' % failure.value)

        proxy = failure.request.meta.get('proxy_info')
        table = failure.request.meta.get('table')
        id = failure.request.meta.get('id')

        if table == self.name:
            command = utils.get_delete_data_command(table, id)
            self.sql.execute(command)
        else:
            # TODO: when validation through a proxy fails, handle it per error type
            pass

            # request = failure.request.meta
            # utils.log('request meta:%s' % str(request))
            #
            # # log all errback failures,
            # # in case you want to do something special for some errors,
            # # you may need the failure's type
            # self.logger.error(repr(failure))
            #
            # #if isinstance(failure.value, HttpError):
            # if failure.check(HttpError):
            #     # you can get the response
            #     response = failure.value.response
            #     self.logger.error('HttpError on %s', response.url)
            #
            # #elif isinstance(failure.value, DNSLookupError):
            # elif failure.check(DNSLookupError):
            #     # this is the original request
            #     request = failure.request
            #     self.logger.error('DNSLookupError on %s', request.url)
            #
            # #elif isinstance(failure.value, TimeoutError):
            # elif failure.check(TimeoutError):
            #     request = failure.request
            #     self.logger.error('TimeoutError on url:%s', request.url)

    def save_page(self, filename, data):
        if get_project_settings().get('IS_RECODE_HTML', False):
            with open('%s/%s.html' % (self.dir_log, filename), 'w') as f:
                f.write(data)
import datetime

from scrapy import Request, Selector
from scrapy.spiders import CrawlSpider

import config
import utils
from sqlhelper import SqlHelper


class Recipe(CrawlSpider):
    name = "user_recipes"
    base_url = 'https://www.xiachufang.com'

    header = {
        'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
        'Accept-Encoding': 'gzip, deflate, sdch',
        'Accept-Language': 'zh-CN,zh;q=0.8,en;q=0.6',
        'Connection': 'keep-alive',
        'Host': 'www.xiachufang.com',
        'Upgrade-Insecure-Requests': '1',
        'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10.11; rv:51.0) Gecko/20100101 Firefox/51.0',
    }

    def __init__(self, *a, **kw):
        super(Recipe, self).__init__(*a, **kw)

        self.dir_name = 'log/%s' % self.name
        self.sql = SqlHelper()
        self.init()
        utils.make_dir(self.dir_name)

    def init(self):
        command = (
            "CREATE TABLE IF NOT EXISTS {} ("
            "`id` INT(8) NOT NULL AUTO_INCREMENT,"
            "`name` CHAR(20) NOT NULL COMMENT 'recipe name',"
            "`url` TEXT NOT NULL COMMENT 'recipe url',"
            "`item_id` INT(8) NOT NULL COMMENT 'recipe ID',"
            "`user_id` INT(12) NOT NULL COMMENT 'user ID',"
            "`create_time` DATETIME NOT NULL,"
            "PRIMARY KEY(id),"
            "UNIQUE KEY `item_id` (`item_id`)"
            ") ENGINE=InnoDB".format(config.item_list_table)
        )
        self.sql.create_table(command)

    def start_requests(self):
        command = "SELECT * from {}".format(config.users_urls_table)
        data = self.sql.query(command)

        for i, user in enumerate(data):
            if i > 200:
                page = 1
                url = self.base_url + user[3] + 'created/?page=%d' % page
                utils.log(url)

                yield Request(
                    url=url,
                    headers=self.header,
                    meta={"page": page, "user_id": user[2], "user_url": user[3]},
                    callback=self.parse_all,
                    errback=self.error_parse,
                )

    def parse_all(self, response):
        utils.log(response.url)

        if response.status == 200:
            file_name = '%s/user.html' % self.dir_name
            self.save_page(file_name, response.body)

            recipes = response.xpath("//div[@class='recipes-280-full-width-list']/ul/li").extract()
            page = response.meta["page"]
            u_url = response.meta["user_url"]
            u_id = response.meta["user_id"]

            for recipe in recipes:
                sel = Selector(text=recipe)
                name = sel.xpath("//p[@class='name ellipsis red-font']/a/text()").extract_first().strip()
                url = sel.xpath("//p[@class='name ellipsis red-font']/a/@href").extract_first()
                item_id = url.split('/')[-2]
                dt = datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S")

                msg = (None, name, url, item_id, u_id, dt)
                command = ("INSERT IGNORE INTO {} "
                           "(id, name, url, item_id, user_id, create_time) "
                           "VALUES(%s,%s,%s,%s,%s,%s)".format(config.item_list_table))
                self.sql.insert_data(command, msg)

            # fetch the next page of this user's created recipes
            page += 1
            if page < 3:
                yield Request(
                    url=self.base_url + u_url + 'created/?page=%d' % page,
                    meta={"page": page, "user_id": u_id, "user_url": u_url},
                    headers=self.header,
                    callback=self.parse_all,
                    errback=self.error_parse,
                    dont_filter=True,
                )

    def error_parse(self, failure):
        request = failure.request
        utils.log('error_parse url:%s meta:%s' % (request.url, request.meta))

    def save_page(self, file_name, data):
        with open(file_name, 'w') as f:
            f.write(data)
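
# How the xiachufang spiders above appear to chain together, inferred from the
# tables they create and read (this run order is not documented in the original
# source): category_urls fills config.category_urls_table; item_list reads it
# and fills config.item_list_table; recipe_detail reads that table and fills
# config.item_detail_table; user_urls fills config.users_urls_table, which
# user_recipes reads. A likely invocation would be:
#
#   scrapy crawl category_urls
#   scrapy crawl item_list
#   scrapy crawl recipe_detail
#   scrapy crawl user_urls
#   scrapy crawl user_recipes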
# -*- coding: utf-8 -*-
import re

from bs4 import BeautifulSoup
from scrapy import FormRequest, Request, Selector
from scrapy.spiders import CrawlSpider

import config
import utils
from sqlhelper import SqlHelper


class GameInfo(CrawlSpider):
    name = 'game_info'

    def __init__(self, *a, **kw):
        super(GameInfo, self).__init__(*a, **kw)

        self.dir_game = 'log/%s' % self.name
        self.sql = SqlHelper()
        self.init()
        utils.make_dir(self.dir_game)

        self.error_count = 0

    def init(self):
        command = ("CREATE TABLE IF NOT EXISTS {} ("
                   "`id` INT(8) NOT NULL AUTO_INCREMENT,"
                   "`name` TEXT NOT NULL,"
                   "`price` INT(5) NOT NULL,"
                   "`metacritic_score` FLOAT DEFAULT NULL,"
                   "`user_reviews_count` INT(6) NOT NULL,"
                   "`positive_user_reviews_count` INT(6) NOT NULL,"
                   "`positive_percent` FLOAT NOT NULL,"
                   "`negative_user_reviews_count` INT(6) NOT NULL,"
                   "`steam_user_reviews_count` INT(6) NOT NULL,"
                   "`non_steam_user_reviews_count` INT(6) NOT NULL,"
                   "`english_user_reviews_count` INT(6) NOT NULL,"
                   "`non_english_user_reviews_count` INT(6) NOT NULL,"
                   "`tag_list` TEXT DEFAULT NULL,"
                   "`achievements_count` INT(4) DEFAULT NULL,"
                   "`category` TEXT NOT NULL,"
                   "`genre` TEXT NOT NULL,"
                   "`developer` TEXT NOT NULL,"
                   "`publisher` TEXT NOT NULL,"
                   "`release_date` TEXT NOT NULL,"
                   "`url` TEXT NOT NULL,"
                   "`language_number` INT(3) DEFAULT NULL,"
                   "`description` TEXT DEFAULT NULL,"
                   "`save_time` TIMESTAMP NOT NULL,"
                   "PRIMARY KEY(id)"
                   ") ENGINE=InnoDB".format(config.steam_game_info_table))
        self.sql.create_table(command)

    def start_requests(self):
        command = "SELECT * FROM {} WHERE is_crawled = 'no' AND type = 'app'".format(
            config.steam_game_urls_table)
        data = self.sql.query(command)

        for i, item in enumerate(data):
            yield Request(
                url=item[3],
                dont_filter=True,
                method='GET',
                headers={
                    'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
                    'Accept-Encoding': 'gzip, deflate',
                    'Accept-Language': 'en-US,en;q=0.5',
                    'Connection': 'keep-alive',
                    'Host': 'store.steampowered.com',
                    'Upgrade-Insecure-Requests': '1',
                    'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10.11; rv:51.0) Gecko/20100101 '
                                  'Firefox/51.0',
                },
                meta={
                    'item': item,
                    'id': item[0],
                },
                cookies={
                    'mature_content': '1',
                },
                callback=self.parse_game,
                errback=self.error_parse,
            )

    def parse_game(self, response):
        self.log('parse_game url:%s' % response.url)
        id = response.meta.get('id')

        # file_name = '%s/%s.html' % (self.dir_game, id)
        # self.save_page(file_name, response.body)

        if u'Please enter your birth date to continue' in response.body:
            self.log('Please enter your birth date to continue meta:%s' % response.meta)
            url = 'http://store.steampowered.com/agecheck/app/%s/' % str(id)
            # NOTE: str(range(...)) sends the whole list as text; a single value
            # such as '1' and '1990' is probably what was intended here
            return FormRequest(url=url,
                               dont_filter=True,
                               method='POST',
                               formdata={
                                   'ageDay': str(range(1, 25)),
                                   'ageMonth': 'January',
                                   'ageYear': str(range(1980, 1995)),
                                   'snr': '1_agecheck_agecheck__age-gate',
                               },
                               callback=self.parse_game)

        soup = BeautifulSoup(response.body, 'lxml')
        sel = Selector(text=response.body)

        name = sel.xpath('//div[@class="apphub_AppName"]/text()').extract_first()
        if name == '' or name is None:
            self.log('no get data meta:%s' % response.meta)
            return

        price = sel.xpath('//div[@class="game_purchase_price price"]/text()').extract_first()
        try:
            p = price.split('¥')
            price = int(p[1])
        except:
            price = -1

        # metacritic score for this game
        metacritic_score = sel.xpath('//div[@class="score high"]/text()').extract_first()
        try:
            metacritic_score = int(metacritic_score)
        except:
            metacritic_score = -1

        # total number of user reviews
        user_reviews_count = sel.xpath('//label[@for="review_type_all"]/span/text()').extract_first()
        user_reviews_count = self.count_to_int(user_reviews_count)

        # number of positive user reviews
        positive_user_reviews_count = sel.xpath(
            '//label[@for="review_type_positive"]/span/text()').extract_first()
        positive_user_reviews_count = self.count_to_int(positive_user_reviews_count)

        # percentage of positive reviews
        if user_reviews_count != -1 and positive_user_reviews_count != -1:
            positive_percent = positive_user_reviews_count * 1.0 / user_reviews_count * 100
        else:
            positive_percent = 0

        # number of negative user reviews
        negative_user_reviews_count = sel.xpath(
            '//label[@for="review_type_negative"]/span/text()').extract_first()
        negative_user_reviews_count = self.count_to_int(negative_user_reviews_count)

        # reviews from users who bought the game on Steam
        steam_user_reviews_count = sel.xpath(
            '//label[@for="purchase_type_steam"]/span/text()').extract_first()
        steam_user_reviews_count = self.count_to_int(steam_user_reviews_count)

        # reviews from users who bought the game elsewhere
        non_steam_user_reviews_count = sel.xpath(
            '//label[@for="purchase_type_non_steam"]/span/text()').extract_first()
        non_steam_user_reviews_count = self.count_to_int(non_steam_user_reviews_count)

        # number of English reviews
        english_user_reviews_count = sel.xpath(
            '//label[@for="review_language_mine"]/span/text()').extract_first()
        english_user_reviews_count = self.count_to_int(english_user_reviews_count)

        # number of non-English reviews
        non_english_user_reviews_count = user_reviews_count - english_user_reviews_count

        # tag list for this game
        try:
            tags = soup.find(attrs={'class': 'glance_tags popular_tags'})
            tag_list = tags.text.replace('\t', '')
            tag_list = tag_list.replace('\n', ',')
        except:
            tag_list = ''

        # number of achievements
        achievements = sel.xpath('//div[@id="achievement_block"]/div/text()').extract_first()
        try:
            achievements_count = re.search('\d+', achievements, re.S).group(0)
            achievements_count = int(achievements_count)
        except:
            achievements_count = 0

        # category breadcrumb, e.g. All Games > Action Games > Counter-Strike
        try:
            category = soup.find(name='div', attrs={'class': 'breadcrumbs'}).text
            category = category.replace('\t', '')
            category = category.replace('\n', '')
        except:
            category = ''

        # genre
        genre = sel.xpath('//div[@class="block_content"]/div/div/a/text()').extract_first()

        # developer
        developer = sel.xpath('//div[@class="block_content"]/div/div/a[2]/text()').extract_first()

        # publisher
        publisher = sel.xpath('//div[@class="block_content"]/div/div/a[3]/text()').extract_first()

        # release date
        release_date = sel.xpath('//div[@class="release_date"]/span/text()').extract_first()

        # number of supported languages
        language_number = len(
            sel.xpath('//table[@class="game_language_options"]/tr').extract()) - 1

        # short description
        description = sel.xpath('//div[@class="game_description_snippet"]/text()').extract_first()

        # time the game was crawled
        save_time = None

        msg = (id, name, price, response.url, metacritic_score, user_reviews_count,
               positive_user_reviews_count, positive_percent, negative_user_reviews_count,
               steam_user_reviews_count, non_steam_user_reviews_count,
               english_user_reviews_count, non_english_user_reviews_count, tag_list,
               achievements_count, category, genre, developer, publisher, release_date,
               language_number, description, save_time)

        command = (
            "INSERT IGNORE INTO {} "
            "(id, name, price, url, metacritic_score, user_reviews_count, positive_user_reviews_count, "
            "positive_percent, negative_user_reviews_count, steam_user_reviews_count, "
            "non_steam_user_reviews_count, english_user_reviews_count, non_english_user_reviews_count, "
            "tag_list, achievements_count, category, genre, developer, publisher, release_date, "
            "language_number, description, save_time) "
            "VALUES(%s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, "
            "%s)".format(config.steam_game_info_table))
        self.sql.insert_data(command, msg)

        # mark the source row as crawled
        command = "UPDATE {0} SET is_crawled='yes' WHERE id='{1}'".format(
            config.steam_game_urls_table, id)
        self.sql.execute(command)

    def error_parse(self, failure):
        request = failure.request
        utils.log('error_parse url:%s meta:%s' % (request.url, request.meta))

    def get_id(self, url):
        # classify the store URL and pull out its numeric id
        if '/sub/' in url:
            pattern = re.compile('/sub/(\d+)/')
            type = 'sub'
        elif '/app/' in url:
            pattern = re.compile('/app/(\d+)/', re.S)
            type = 'app'
        elif '/bundle/' in url:
            pattern = re.compile('/bundle/(\d+)/', re.S)
            type = 'bundle'
        else:
            pattern = re.compile('/(\d+)/', re.S)
            type = 'other'
            utils.log('get_id other url:%s' % url)

        id = re.search(pattern, url)
        if id:
            return id.group(1)

        self.error_count = self.error_count + 1
        utils.log('get_id error url:%s' % url)
        return -self.error_count

    def count_to_int(self, data):
        # normalize review counts like "(12,345)" to plain integers
        try:
            ret = data
            ret = ret.replace('(', '')
            ret = ret.replace(')', '')
            ret = ret.replace(',', '')
            return int(ret)
        except:
            return -1

    def save_page(self, file_name, data):
        with open(file_name, 'w') as f:
            f.write(data)
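
# A quick illustration of count_to_int on strings shaped like Steam's review
# counts; the sample inputs are made up for this example, and __new__ is used
# so that GameInfo's database setup in __init__ is skipped.
if __name__ == '__main__':
    probe = GameInfo.__new__(GameInfo)
    print(probe.count_to_int('(12,345)'))   # -> 12345
    print(probe.count_to_int('1,024'))      # -> 1024
    print(probe.count_to_int(None))         # -> -1 (fallback on any parse error)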
# -*- coding: utf-8 -*-
# Python 2 version of the Crawler above (urllib2, xrange, old except syntax).
import re
import urllib2

from bs4 import BeautifulSoup as bs

import utils
from sqlhelper import SqlHelper


class Crawler(object):
    def __init__(self):
        super(Crawler, self).__init__()

        self.album_prefix = 'https://mm.taobao.com/self/album/open_album_list.htm?_charset=utf-8&user_id%20={0}&page={1}'
        self.image_prefix = 'https://mm.taobao.com/album/json/get_album_photo_list.htm?user_id={0}&album_id={1}&page={2}'

        self.image_pattern = re.compile('''img.*290x10000.jpg''', re.U)
        self.image_name_pattern = re.compile('''"picId":"(.*?)"''', re.U)
        self.model_pattern = re.compile('''<a class="lady-name" href="(.*?)".*>(.*?)</a>''', re.U)
        self.album_pattern = re.compile('''.*album_id=(.*?)&.*''', re.U)

        self.links = []
        self.ids = []
        self.names = []
        self.sql = SqlHelper()

    def readHtml(self, html):
        response = urllib2.urlopen(html)
        return response.read()

    def getLinkIdAndNames(self, htmlData):
        items = re.findall(self.model_pattern, htmlData)

        self.links = [link for link, name in items]
        self.names = [name.decode('gbk') for link, name in items]
        self.ids = [link[link.index('=') + 1:] for link in self.links]

    def getAlbums(self):
        for i, model_id in enumerate(self.ids):
            utils.log('start downloading:%s' % self.names[i])
            # print 'start downloading', self.names[i]

            # insert the user
            command = self.sql.insert_data_to_users()
            msg = (model_id, self.names[i], "")
            try:
                self.sql.insert_data(command, msg, commit=True)
            except Exception, e:
                utils.log('insert users data errors')

            for page in xrange(1, 10):
                utils.log('current page:%s' % page)
                # print 'current page', page

                model_url = self.album_prefix.format(model_id, page)
                soup = bs(self.readHtml(model_url), 'html.parser')
                albums = soup.find_all('div', class_='mm-photo-cell-middle')
                if not albums:
                    break

                for album in albums:
                    album_name = album.find('h4').a.string.strip().rstrip('.')
                    album_link = album.find('h4').a['href']
                    album_id = re.findall(self.album_pattern, album_link)[0]
                    # strip both the decoded and the mojibake form of '创建时间:' (creation time)
                    album_create_time = album.find('p', class_='mm-photo-date').string.strip(
                        u'创建时间: ').strip(u'´´½¨Ê±¼ä:')
                    # strip '()' and both forms of the count unit '张'
                    album_img_count = album.find('span', class_='mm-pic-number').string.strip(
                        '()').strip(u'张').strip(u'ÕÅ')

                    # print ">>>>>>>>>>>>>>>>>>>>>>"
                    # print album.find('p', class_ = 'mm-photo-date').string
                    # print album_create_time
                    # print ">>>>>>>>>>>>>>>>>>>>>>"

                    # insert the album
                    command = self.sql.insert_data_to_albums()
                    msg = (album_id, model_id, album_name, album_create_time, "", 1, album_img_count)
                    try:
                        self.sql.insert_data(command, msg, commit=True)
                    except Exception, e:
                        utils.log('insert albums data errors')

                    utils.log('start in album:%s, total size: %s' % (album_name, album_img_count))
                    self.getImages(model_id, album_id, album_img_count)