def randitem(spargs):
    guid = spargs.get('guid', 0)
    utils.push_redis(guid, 0, '正在随机产生商品链接', save_to_mysql=False)

    url = 'https://diviner.taobao.com/diviner?p=610009&callback=jsonpCallbackMoreGood&lid=1&uuid=122270672' \
          '.1492415671516609876050.1492415672.1492415672.1492415672.1&pin=&lim=100&ec=utf-8&_=1492415813682'
    headers = {
        'Host': 'diviner.taobao.com',
        'Referer': 'https://www.taobao.com/',
        'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10.11; rv:52.0) Gecko/20100101 Firefox/52.0'
    }
    cookies = {
        '__jda': '122270672.1492415671516609876050.1492415672.1492415672.1492415672.1',
        '__jdb': '122270672.1.1492415671516609876050|1.1492415672',
        '__jdc': '122270672',
        '__jdv': '122270672|direct|-|none|-|1492415671524',
        '__jdu': '1492415671516609876050',
    }

    r = requests.get(url=url, headers=headers, cookies=cookies, timeout=20)

    # Pull every product id ("sku" field) out of the JSONP response and pick one at random.
    pattern = re.compile('"sku":(\d+),', re.S)
    ids = re.findall(pattern, r.text)
    product_id = random.choice(ids)

    url = 'https://item.taobao.com/%s.html' % str(product_id)
    utils.push_redis(guid, 0, '生成商品链接:<a href="%s" target="_blank">%s' % (url, url),
                     save_to_mysql=False)

    sql = SqlHelper()
    command = "SELECT id FROM {table} WHERE id={product_id}".format(
        table=config.tb_item_table, product_id=product_id)
    result = sql.query_one(command)

    # If the item is not in the database yet, crawl it afresh.
    if result is None:
        cmd = 'cd {dir};python manage.py real_time_analysis -a name={name} -a guid={guid} ' \
              '-a product_id={product_id} -a url={url};'.format(
                  url=str(url), name='tb', dir=settings.BASE_DIR,
                  guid=guid, product_id=product_id)
        subprocess.Popen(cmd, shell=True)
    else:
        # Otherwise serve the analysis results already stored in the database.
        command = "SELECT * FROM {0} WHERE product_id={1} ORDER BY id".format(
            config.analysis_item_table, product_id)
        result = sql.query(command)
        for res in result:
            utils.push_redis(guid, res[1], res[2], res[3], save_to_mysql=False)
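# SqlHelper is used by every snippet in this section but defined elsewhere.
# Below is a minimal sketch of the interface the calls here assume (query,
# query_one, execute, insert_data, create_table). The MySQLdb backing and the
# connection parameters are assumptions for illustration, not the project's
# real implementation.
import MySQLdb


class SqlHelper(object):
    def __init__(self):
        # Placeholder connection settings, not the project's actual config.
        self.conn = MySQLdb.connect(host='localhost', user='root', passwd='',
                                    db='test', charset='utf8')
        self.cursor = self.conn.cursor()

    def create_table(self, command):
        self.cursor.execute(command)
        self.conn.commit()

    def query(self, command):
        self.cursor.execute(command)
        return self.cursor.fetchall()

    def query_one(self, command):
        self.cursor.execute(command)
        return self.cursor.fetchone()

    def execute(self, command):
        self.cursor.execute(command)
        self.conn.commit()

    def insert_data(self, command, data):
        # Parameterized insert: `data` is bound to the %s placeholders.
        self.cursor.execute(command, data)
        self.conn.commit()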
def GET(self):
    try:
        sql = SqlHelper()
        inputs = web.input()
        name = inputs.get('name')

        command = "SELECT * FROM {0}".format(name)
        result = sql.query(command)

        data = [{
            'ip': item[1],
            'port': item[2],
            'speed': item[6]
        } for item in result]
        return json.dumps(data, indent=4)
    except Exception as e:
        utils.log('select exception msg:%s' % e)
        return []
def GET(self):
    try:
        sql = SqlHelper()
        inputs = web.input()
        name = inputs.get('name')
        anonymity = inputs.get('anonymity', None)
        https = inputs.get('https', None)
        order = inputs.get('order', 'speed')
        sort = inputs.get('sort', 'asc')
        count = inputs.get('count', 100)

        # Build the query from the optional anonymity/https filters.
        if anonymity is None and https is None:
            command = "SELECT * FROM {name} ORDER BY {order} {sort} LIMIT {count}".format(
                name=name, order=order, sort=sort, count=count)
        elif anonymity is not None and https is None:
            command = "SELECT * FROM {name} WHERE anonymity='{anonymity}' ORDER BY {order} {sort} " \
                      "LIMIT {count}".format(name=name, anonymity=anonymity, order=order,
                                             sort=sort, count=count)
        elif anonymity is None and https is not None:
            command = "SELECT * FROM {name} WHERE https='{https}' ORDER BY {order} {sort} " \
                      "LIMIT {count}".format(name=name, https=https, order=order,
                                             sort=sort, count=count)
        else:
            command = "SELECT * FROM {name} WHERE anonymity='{anonymity}' AND https='{https}' " \
                      "ORDER BY {order} {sort} LIMIT {count}".format(
                          name=name, anonymity=anonymity, https=https, order=order,
                          sort=sort, count=count)

        result = sql.query(command)
        data = [{
            'id': item[0],
            'ip': item[1],
            'port': item[2],
            'anonymity': item[4],
            'https': item[5],
            'speed': item[6],
            'save_time': str(item[8])
        } for item in result]
        return json.dumps(data, indent=4)
    except Exception as e:
        utils.log('select exception msg:%s' % e)
        return []
def GET(self):
    try:
        sql = SqlHelper()
        inputs = web.input()
        name = inputs.get('name')
        anonymity = inputs.get('anonymity', None)
        https = inputs.get('https', None)
        sort = inputs.get('sort', 'speed')
        count = inputs.get('count', 100)

        # Same filter logic as above, with positional format arguments.
        if anonymity is None and https is None:
            command = "SELECT * FROM {0} ORDER BY {1} LIMIT {2}".format(name, sort, count)
        elif anonymity is not None and https is None:
            command = "SELECT * FROM {0} WHERE anonymity='{1}' ORDER BY {2} LIMIT {3}".format(
                name, anonymity, sort, count)
        elif anonymity is None and https is not None:
            command = "SELECT * FROM {0} WHERE https='{1}' ORDER BY {2} LIMIT {3}".format(
                name, https, sort, count)
        else:
            command = "SELECT * FROM {0} WHERE anonymity='{1}' AND https='{2}' ORDER BY {3} " \
                      "LIMIT {4}".format(name, anonymity, https, sort, count)

        result = sql.query(command)
        data = [{
            'ip': item[1],
            'port': item[2],
            'speed': item[6]
        } for item in result]
        return json.dumps(data, indent=4)
    except Exception as e:
        utils.log('select exception msg:%s' % e)
        return []
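# All three handlers above splice request parameters straight into SQL, so a
# crafted `name`, `order`, or `anonymity` value can inject arbitrary SQL.
# A sketch of a safer variant, assuming a DB-API cursor such as MySQLdb's;
# ALLOWED_TABLES, ALLOWED_COLUMNS and build_proxy_query are illustrative
# names, not part of the project.
ALLOWED_TABLES = {'free_ipproxy', 'httpbin'}    # assumed table names
ALLOWED_COLUMNS = {'id', 'speed', 'save_time'}  # assumed sortable columns


def build_proxy_query(cursor, name, anonymity=None, https=None,
                      order='speed', sort='asc', count=100):
    # Identifiers cannot be bound as parameters, so whitelist them instead.
    if name not in ALLOWED_TABLES or order not in ALLOWED_COLUMNS:
        raise ValueError('invalid table or order column')
    if sort not in ('asc', 'desc'):
        raise ValueError('invalid sort direction')

    clauses, params = [], []
    if anonymity is not None:
        clauses.append('anonymity = %s')
        params.append(anonymity)
    if https is not None:
        clauses.append('https = %s')
        params.append(https)
    where = ('WHERE ' + ' AND '.join(clauses)) if clauses else ''

    # Values go through the driver; only whitelisted identifiers are formatted in.
    command = 'SELECT * FROM {0} {1} ORDER BY {2} {3} LIMIT %s'.format(
        name, where, order, sort)
    params.append(int(count))
    cursor.execute(command, params)
    return cursor.fetchall()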
class RecipeDetail(CrawlSpider):
    name = "recipe_detail"
    base_url = 'https://www.xiachufang.com'
    header = {
        'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
        'Accept-Encoding': 'gzip, deflate, sdch',
        'Accept-Language': 'zh-CN,zh;q=0.8,en;q=0.6',
        'Connection': 'keep-alive',
        'Host': 'www.xiachufang.com',
        'Upgrade-Insecure-Requests': '1',
        'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10.11; rv:51.0) Gecko/20100101 Firefox/51.0',
    }

    def __init__(self, *a, **kw):
        super(RecipeDetail, self).__init__(*a, **kw)
        self.dir_name = 'log/%s' % self.name
        self.sql = SqlHelper()
        self.init()
        utils.make_dir(self.dir_name)

    def init(self):
        command = (
            "CREATE TABLE IF NOT EXISTS {} ("
            "`id` INT(8) NOT NULL AUTO_INCREMENT,"
            "`name` CHAR(20) NOT NULL COMMENT 'recipe name',"
            "`recipe_id` INT(12) NOT NULL COMMENT 'recipe ID',"
            "`source_name` CHAR(20) NOT NULL COMMENT 'source name',"
            "`source_id` INT(8) NOT NULL COMMENT 'source ID',"
            "`create_time` DATETIME NOT NULL,"
            "PRIMARY KEY(id)"
            ") ENGINE=InnoDB".format(config.item_detail_table)
        )
        self.sql.create_table(command)

    def start_requests(self):
        command = "SELECT * FROM {}".format(config.item_list_table)
        data = self.sql.query(command)
        for i, recipe in enumerate(data):
            # Only crawl the slice of recipes with ids in (8999, 10000).
            if 8999 < recipe[0] < 10000:
                url = self.base_url + recipe[2]
                utils.log(url)
                yield Request(
                    url=url,
                    headers=self.header,
                    callback=self.parse_all,
                    errback=self.error_parse,
                    meta={"re_id": recipe[3], "re_name": recipe[1]},
                    dont_filter=True,
                )

    def parse_all(self, response):
        utils.log(response.url)
        if response.status == 429:
            raise CloseSpider('Too many requests, IP banned')
        if response.status == 200:
            file_name = '%s/recipe.html' % self.dir_name
            self.save_page(file_name, response.body)

            # Each row of the ingredients table links to an ingredient page.
            sources = response.xpath("//div[@class='ings']//tr").extract()
            for source in sources:
                sel = Selector(text=source)
                source_name = sel.xpath("//a/text()").extract_first()
                url = sel.xpath("//a/@href").extract_first()
                if source_name is not None and url is not None:
                    source_id = url.split('/')[-2]
                    r_name = response.meta["re_name"]
                    r_id = response.meta["re_id"]
                    dt = datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S")
                    msg = (None, r_name, r_id, source_name, source_id, dt)
                    command = ("INSERT IGNORE INTO {} "
                               "(id, name, recipe_id, source_name, source_id, create_time)"
                               "VALUES(%s,%s,%s,%s,%s,%s)".format(config.item_detail_table))
                    self.sql.insert_data(command, msg)

    def error_parse(self, failure):
        request = failure.request
        utils.log('error_parse url:%s meta:%s' % (request.url, request.meta))

    def save_page(self, file_name, data):
        with open(file_name, 'w') as f:
            f.write(data)
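# utils.make_dir and utils.log are project helpers not shown in this section.
# Minimal stand-ins inferred from how they are called above; this is an
# assumption about their behavior, not the project's real utils module.
import os
import logging


def make_dir(dir_name):
    # Create the per-spider log directory if it does not already exist.
    if not os.path.exists(dir_name):
        os.makedirs(dir_name)


def log(msg, level=logging.DEBUG):
    # Route messages through the logging module and echo them to stdout.
    logging.log(level, msg)
    print(msg)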
def runspider(request):
    data = {
        'status': 'failure',
        'guid': '0',
        'info': '',
    }

    try:
        # Production uses a POST request.
        url = request.POST.get('url')
        force = request.POST.get('force', 'false')

        # The JD product id is the digit run in the item URL.
        pattern = re.compile('\d+', re.S)
        product_id = re.search(pattern, url).group()

        sql = SqlHelper()
        utils.log('product_id:%s' % product_id)

        if 'item.jd.com' in url and product_id is not None:
            data['status'] = 'success'
            data['guid'] = str(uuid.uuid4())
            data['info'] = '成功接收数据,正在为您抓取并分析数据,精彩稍候呈现'

            command = "SELECT id FROM {table} WHERE id={product_id}".format(
                table=config.jd_item_table, product_id=product_id)
            result = sql.query_one(command)

            if result is None:
                name = 'jd'
                cmd = 'cd {dir};python manage.py real_time_analysis -a name={name} -a guid={guid} ' \
                      '-a product_id={product_id} -a url={url};'.format(
                          url=str(url), name=name, dir=settings.BASE_DIR,
                          guid=data.get('guid'), product_id=product_id)
                subprocess.Popen(cmd, shell=True)
            else:
                if force == 'false':
                    utils.log('Data already in the database; returning the stored analysis results')
                    command = "SELECT * FROM {0} WHERE product_id={1} ORDER BY id".format(
                        config.analysis_item_table, product_id)
                    result = sql.query(command)
                    for res in result:
                        utils.push_redis(data.get('guid'), res[1], res[2], res[3],
                                         save_to_mysql=False)
                else:
                    command = "DELETE FROM {0} WHERE product_id={1}".format(
                        config.analysis_item_table, product_id)
                    sql.execute(command)

                    # Re-run the analysis.
                    cmd = 'cd {dir};python manage.py analysis -a url={url} -a name={name} -a guid={guid} -a ' \
                          'product_id={product_id};'.format(
                              url=url, name='jd', dir=settings.BASE_DIR,
                              guid=data.get('guid'), product_id=product_id)
                    subprocess.Popen(cmd, shell=True)
        else:
            data['info'] = '传入网址有误,请检查后重新输入,请输入以下格式的网址:\n%s' % \
                           'https://item.jd.com/3995645.html'
    except Exception as e:
        logging.error('run spider exception:%s' % e)
        data['info'] = '出现错误,错误原因:%s' % e
class ItemDetail(CrawlSpider):
    name = "item_list"
    base_url = "http://www.xiachufang.com"
    header = {
        'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
        'Accept-Encoding': 'gzip, deflate, sdch',
        'Accept-Language': 'zh-CN,zh;q=0.8,en;q=0.6',
        'Connection': 'keep-alive',
        'Host': 'www.xiachufang.com',
        'Upgrade-Insecure-Requests': '1',
        'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10.11; rv:51.0) Gecko/20100101 Firefox/51.0',
    }

    def __init__(self, *a, **kw):
        super(ItemDetail, self).__init__(*a, **kw)
        self.dir_name = 'log/%s' % self.name
        self.sql = SqlHelper()
        self.init()
        utils.make_dir(self.dir_name)

    def init(self):
        command = ("CREATE TABLE IF NOT EXISTS {} ("
                   "`id` INT(8) NOT NULL AUTO_INCREMENT,"
                   "`name` CHAR(20) NOT NULL COMMENT 'dish name',"
                   "`url` TEXT NOT NULL COMMENT 'dish url',"
                   "`img` TEXT NOT NULL COMMENT 'cover image',"
                   "`item_id` INT(8) NOT NULL COMMENT 'dish id',"
                   "`source` TEXT NOT NULL COMMENT 'ingredients',"
                   "`score` CHAR(5) NOT NULL COMMENT 'rating',"
                   "`create_time` DATETIME NOT NULL,"
                   "PRIMARY KEY(id),"
                   "UNIQUE KEY `item_id` (`item_id`)"
                   ") ENGINE=InnoDB".format(config.item_list_table))
        self.sql.create_table(command)

    def start_requests(self):
        command = "SELECT * FROM {}".format(config.category_urls_table)
        data = self.sql.query(command)
        for i, category in enumerate(data):
            url = self.base_url + category[2]
            utils.log(url)
            yield Request(
                url=url,
                headers=self.header,
                callback=self.parse_all,
                errback=self.error_parse,
            )

    def parse_all(self, response):
        utils.log(response.url)
        if response.status == 200:
            file_name = '%s/category.html' % self.dir_name
            self.save_page(file_name, response.body)

            recipes = response.xpath("//div[@class='normal-recipe-list']/ul/li").extract()
            self.parse_recipes(recipes)

            # Follow the "next" link until the category runs out of pages.
            nextPage = response.xpath("//div[@class='pager']/a[@class='next']/@href").extract_first()
            if nextPage:
                yield Request(
                    url=self.base_url + nextPage,
                    headers=self.header,
                    callback=self.parse_all,
                    errback=self.error_parse,
                )

    def parse_recipes(self, recipes):
        for recipe in recipes:
            sel = Selector(text=recipe)
            name = sel.xpath("//p[@class='name']/text()").extract_first().strip()
            url = sel.xpath("//a[1]/@href").extract_first()
            img = sel.xpath("//div[@class='cover pure-u']/img/@data-src").extract_first()
            item_id = re.compile("/recipe/(.*?)/").findall(url)[0]
            source = sel.xpath("//p[@class='ing ellipsis']/text()").extract_first().strip()
            score = sel.xpath("//p[@class='stats']/span/text()").extract_first().strip()
            dt = datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S")

            msg = (None, name, url, img, item_id, source, score, dt)
            command = ("INSERT IGNORE INTO {} "
                       "(id, name, url, img, item_id, source, score, create_time)"
                       "VALUES(%s,%s,%s,%s,%s,%s,%s,%s)".format(config.item_list_table))
            self.sql.insert_data(command, msg)

    def error_parse(self, failure):
        request = failure.request
        utils.log('error_parse url:%s meta:%s' % (request.url, request.meta))

    def save_page(self, file_name, data):
        with open(file_name, 'w') as f:
            f.write(data)
def runspider(request):
    data = {
        'status': 'failure',
        'guid': '0',
        'info': '',
    }

    try:
        # Production uses a POST request.
        url = request.POST.get('url')
        force = request.POST.get('force', 'false')

        # The user id is the token after "user-rate-" in the URL.
        pattern = re.compile('user-rate-')
        urls = re.split(pattern, url)
        user_id = urls[1]
        pattern = re.compile('\w+', re.S)
        user_id = re.search(pattern, user_id).group()

        sql = SqlHelper()
        utils.log('user_id:%s' % user_id)

        if 'rate.taobao.com' in url and user_id is not None:
            data['status'] = 'success'
            data['guid'] = str(random.randint(1000000000000, 9999999999999)) + '_' + \
                           str(random.randint(100, 999))
            data['info'] = '成功接收数据,正在为您抓取并分析数据,精彩稍候呈现'

            command = "SELECT id FROM {table} WHERE id={user_id}".format(
                table=config.tb_item_table, user_id=user_id)
            result = sql.query_one(command)

            if result is None:
                name = 'tb_comment'
                cmd = 'cd {dir};python manage.py real_time_analysis -a name={name} -a guid={guid} ' \
                      '-a user_id={user_id} -a url={url};'.format(
                          url=str(url), name=name, dir=settings.BASE_DIR,
                          guid=data.get('guid'), user_id=user_id)
                logging.warn(cmd)
                subprocess.Popen(cmd, shell=True)
            else:
                if force == 'false':
                    utils.log('Data already in the database; returning the stored analysis results')
                    command = "SELECT * FROM {0} WHERE user_id={1} ORDER BY id".format(
                        config.analysis_item_table, user_id)
                    result = sql.query(command)
                    for res in result:
                        utils.push_redis(data.get('guid'), res[1], res[2], res[3],
                                         save_to_mysql=False)
                else:
                    command = "DELETE FROM {0} WHERE user_id={1}".format(
                        config.analysis_item_table, user_id)
                    sql.execute(command)

                    # Re-run the analysis.
                    cmd = 'cd {dir};python manage.py analysis -a url={url} -a name={name} -a guid={guid} -a ' \
                          'user_id={user_id};'.format(
                              url=url, name='tb', dir=settings.BASE_DIR,
                              guid=data.get('guid'), user_id=user_id)
                    subprocess.Popen(cmd, shell=True)
        else:
            data['info'] = '传入网址有误,请检查后重新输入,请输入以下格式的网址:\n%s' % \
                           'https://rate.taobao.com/user-rate-UvGv0MFc0vFILvgTT.htm'
    except Exception as e:
        logging.error('run spider exception:%s' % e)
        data['info'] = '出现错误,错误原因:%s' % e
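# Both runspider views above build a shell string that embeds the request URL
# and run it with shell=True, which lets a crafted URL inject extra shell
# commands. A hedged sketch of the same launch as an argument vector;
# launch_spider is an illustrative helper (not in the project), and cwd
# replaces the 'cd {dir};' prefix.
import subprocess


def launch_spider(base_dir, name, guid, product_id, url):
    # Argument vector: nothing here is re-parsed by a shell, so a crafted
    # `url` cannot smuggle in additional commands.
    cmd = [
        'python', 'manage.py', 'real_time_analysis',
        '-a', 'name=%s' % name,
        '-a', 'guid=%s' % guid,
        '-a', 'product_id=%s' % product_id,
        '-a', 'url=%s' % url,
    ]
    return subprocess.Popen(cmd, cwd=base_dir)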
class Recipe(CrawlSpider):
    name = "user_recipes"
    base_url = 'https://www.xiachufang.com'
    header = {
        'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
        'Accept-Encoding': 'gzip, deflate, sdch',
        'Accept-Language': 'zh-CN,zh;q=0.8,en;q=0.6',
        'Connection': 'keep-alive',
        'Host': 'www.xiachufang.com',
        'Upgrade-Insecure-Requests': '1',
        'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10.11; rv:51.0) Gecko/20100101 Firefox/51.0',
    }

    def __init__(self, *a, **kw):
        super(Recipe, self).__init__(*a, **kw)
        self.dir_name = 'log/%s' % self.name
        self.sql = SqlHelper()
        self.init()
        utils.make_dir(self.dir_name)

    def init(self):
        command = (
            "CREATE TABLE IF NOT EXISTS {} ("
            "`id` INT(8) NOT NULL AUTO_INCREMENT,"
            "`name` CHAR(20) NOT NULL COMMENT 'recipe name',"
            "`url` TEXT NOT NULL COMMENT 'recipe url',"
            "`item_id` INT(8) NOT NULL COMMENT 'recipe ID',"
            "`user_id` INT(12) NOT NULL COMMENT 'user ID',"
            "`create_time` DATETIME NOT NULL,"
            "PRIMARY KEY(id),"
            "UNIQUE KEY `item_id` (`item_id`)"
            ") ENGINE=InnoDB".format(config.item_list_table)
        )
        self.sql.create_table(command)

    def start_requests(self):
        command = "SELECT * FROM {}".format(config.users_urls_table)
        data = self.sql.query(command)
        for i, user in enumerate(data):
            # Resume point: only process users after the first 200 rows.
            if i > 200:
                page = 1
                url = self.base_url + user[3] + 'created/?page=%d' % page
                utils.log(url)
                yield Request(
                    url=url,
                    headers=self.header,
                    meta={"page": page, "user_id": user[2], "user_url": user[3]},
                    callback=self.parse_all,
                    errback=self.error_parse,
                )

    def parse_all(self, response):
        utils.log(response.url)
        if response.status == 200:
            file_name = '%s/user.html' % self.dir_name
            self.save_page(file_name, response.body)

            recipes = response.xpath("//div[@class='recipes-280-full-width-list']/ul/li").extract()
            page = response.meta["page"]
            u_url = response.meta["user_url"]
            u_id = response.meta["user_id"]

            for recipe in recipes:
                sel = Selector(text=recipe)
                name = sel.xpath("//p[@class='name ellipsis red-font']/a/text()").extract_first().strip()
                url = sel.xpath("//p[@class='name ellipsis red-font']/a/@href").extract_first()
                item_id = url.split('/')[-2]
                dt = datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S")

                msg = (None, name, url, item_id, u_id, dt)
                command = ("INSERT IGNORE INTO {} "
                           "(id, name, url, item_id, user_id, create_time)"
                           "VALUES(%s,%s,%s,%s,%s,%s)".format(config.item_list_table))
                self.sql.insert_data(command, msg)

            # Crawl at most the first two pages of each user's recipes.
            page += 1
            if page < 3:
                yield Request(
                    url=self.base_url + u_url + 'created/?page=%d' % page,
                    meta={"page": page, "user_id": u_id, "user_url": u_url},
                    headers=self.header,
                    callback=self.parse_all,
                    errback=self.error_parse,
                    dont_filter=True,
                )

    def error_parse(self, failure):
        request = failure.request
        utils.log('error_parse url:%s meta:%s' % (request.url, request.meta))

    def save_page(self, file_name, data):
        with open(file_name, 'w') as f:
            f.write(data)
# The opens for csvFile1/csvFile2 are not shown in this excerpt; the file
# names below are assumptions chosen to match the headers written to them.
csvFile1 = open('user_item.csv', 'w')
csvFile2 = open('recipe_source.csv', 'w')
csvFile3 = open('recipe_name.csv', 'w')
csvFile4 = open('source_name.csv', 'w')

writer1 = csv.writer(csvFile1)
writer1.writerow(['user_id', 'item_id'])

writer2 = csv.writer(csvFile2)
writer2.writerow(['recipe_id', 'source_id'])

writer3 = csv.writer(csvFile3)
writer3.writerow(['recipe_id', 'recipe_name'])

writer4 = csv.writer(csvFile4)
writer4.writerow(['source_id', 'source_name'])

sql = SqlHelper()

# Export (user_id, item_id) pairs.
command = "SELECT user_id, item_id FROM {}".format(config.item_list_table)
data = sql.query(command)
for i, recipe in enumerate(data):
    writer1.writerow([recipe[0], recipe[1]])
csvFile1.close()

# Export (item_id, name) pairs.
command = "SELECT item_id, name FROM {}".format(config.item_list_table)
data1 = sql.query(command)
for i, item in enumerate(data1):
    # name_en = translator.translate(item[1]).text
    writer3.writerow([item[0], item[1].encode('utf8')])
csvFile3.close()
class GameInfo(CrawlSpider):
    name = 'game_info'

    def __init__(self, *a, **kw):
        super(GameInfo, self).__init__(*a, **kw)
        self.dir_game = 'log/%s' % self.name
        self.sql = SqlHelper()
        self.init()
        utils.make_dir(self.dir_game)
        self.error_count = 0

    def init(self):
        command = ("CREATE TABLE IF NOT EXISTS {} ("
                   "`id` INT(8) NOT NULL AUTO_INCREMENT,"
                   "`name` TEXT NOT NULL,"
                   "`price` INT(5) NOT NULL,"
                   "`metacritic_score` FLOAT DEFAULT NULL,"
                   "`user_reviews_count` INT(6) NOT NULL,"
                   "`positive_user_reviews_count` INT(6) NOT NULL,"
                   "`positive_percent` FLOAT NOT NULL,"
                   "`negative_user_reviews_count` INT(6) NOT NULL,"
                   "`steam_user_reviews_count` INT(6) NOT NULL,"
                   "`non_steam_user_reviews_count` INT(6) NOT NULL,"
                   "`english_user_reviews_count` INT(6) NOT NULL,"
                   "`non_english_user_reviews_count` INT(6) NOT NULL,"
                   "`tag_list` TEXT DEFAULT NULL,"
                   "`achievements_count` INT(4) DEFAULT NULL,"
                   "`category` TEXT NOT NULL,"
                   "`genre` TEXT NOT NULL,"
                   "`developer` TEXT NOT NULL,"
                   "`publisher` TEXT NOT NULL,"
                   "`release_date` TEXT NOT NULL,"
                   "`url` TEXT NOT NULL,"
                   "`language_number` INT(3) DEFAULT NULL,"
                   "`description` TEXT DEFAULT NULL,"
                   "`save_time` TIMESTAMP NOT NULL,"
                   "PRIMARY KEY(id)"
                   ") ENGINE=InnoDB".format(config.steam_game_info_table))
        self.sql.create_table(command)

    def start_requests(self):
        command = "SELECT * FROM {} WHERE is_crawled='no' AND type='app'".format(
            config.steam_game_urls_table)
        data = self.sql.query(command)
        for i, item in enumerate(data):
            yield Request(
                url=item[3],
                dont_filter=True,
                method='GET',
                headers={
                    'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
                    'Accept-Encoding': 'gzip, deflate',
                    'Accept-Language': 'en-US,en;q=0.5',
                    'Connection': 'keep-alive',
                    'Host': 'store.steampowered.com',
                    'Upgrade-Insecure-Requests': '1',
                    'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10.11; rv:51.0) Gecko/20100101 '
                                  'Firefox/51.0',
                },
                meta={
                    'item': item,
                    'id': item[0],
                },
                cookies={
                    'mature_content': '1',
                },
                callback=self.parse_game,
                errback=self.error_parse,
            )

    def parse_game(self, response):
        self.log('parse_game url:%s' % response.url)
        id = response.meta.get('id')

        # file_name = '%s/%s.html' % (self.dir_game, id)
        # self.save_page(file_name, response.body)

        if u'Please enter your birth date to continue' in response.body:
            self.log('Please enter your birth date to continue meta:%s' % response.meta)
            url = 'http://store.steampowered.com/agecheck/app/%s/' % str(id)
            return FormRequest(
                url=url,
                dont_filter=True,
                method='POST',
                formdata={
                    # The original passed str(range(...)) here, which submits
                    # the whole list as text; the form expects single values.
                    'ageDay': '1',
                    'ageMonth': 'January',
                    'ageYear': '1990',
                    'snr': '1_agecheck_agecheck__age-gate',
                },
                meta=response.meta,
                callback=self.parse_game)

        soup = BeautifulSoup(response.body, 'lxml')
        sel = Selector(text=response.body)

        name = sel.xpath('//div[@class="apphub_AppName"]/text()').extract_first()
        if not name:
            self.log('no get data meta:%s' % response.meta)
            return

        price = sel.xpath('//div[@class="game_purchase_price price"]/text()').extract_first()
        try:
            p = price.split('¥')
            price = int(p[1])
        except:
            price = -1

        # Metacritic score for the game.
        metacritic_score = sel.xpath('//div[@class="score high"]/text()').extract_first()
        try:
            metacritic_score = int(metacritic_score)
        except:
            metacritic_score = -1

        # Total number of user reviews.
        user_reviews_count = sel.xpath(
            '//label[@for="review_type_all"]/span/text()').extract_first()
        user_reviews_count = self.count_to_int(user_reviews_count)

        # Number of positive reviews.
        positive_user_reviews_count = sel.xpath(
            '//label[@for="review_type_positive"]/span/text()').extract_first()
        positive_user_reviews_count = self.count_to_int(positive_user_reviews_count)

        # Percentage of positive reviews.
        if user_reviews_count != -1 and positive_user_reviews_count != -1:
            positive_percent = positive_user_reviews_count * 1.0 / user_reviews_count * 100
        else:
            positive_percent = 0

        # Number of negative reviews.
        negative_user_reviews_count = sel.xpath(
            '//label[@for="review_type_negative"]/span/text()').extract_first()
        negative_user_reviews_count = self.count_to_int(negative_user_reviews_count)

        # Reviews from users who bought the game on Steam.
        steam_user_reviews_count = sel.xpath(
            '//label[@for="purchase_type_steam"]/span/text()').extract_first()
        steam_user_reviews_count = self.count_to_int(steam_user_reviews_count)

        # Reviews from users who bought the game elsewhere.
        non_steam_user_reviews_count = sel.xpath(
            '//label[@for="purchase_type_non_steam"]/span/text()').extract_first()
        non_steam_user_reviews_count = self.count_to_int(non_steam_user_reviews_count)

        # Number of English-language reviews.
        english_user_reviews_count = sel.xpath(
            '//label[@for="review_language_mine"]/span/text()').extract_first()
        english_user_reviews_count = self.count_to_int(english_user_reviews_count)

        # Number of non-English reviews.
        non_english_user_reviews_count = user_reviews_count - english_user_reviews_count

        # The game's tag list.
        try:
            tags = soup.find(attrs={'class': 'glance_tags popular_tags'})
            tag_list = tags.text.replace('\t', '')
            tag_list = tag_list.replace('\n', ',')
        except:
            tag_list = ''

        # Number of achievements.
        achievements = sel.xpath('//div[@id="achievement_block"]/div/text()').extract_first()
        try:
            achievements_count = re.search('\d+', achievements, re.S).group(0)
            achievements_count = int(achievements_count)
        except:
            achievements_count = 0

        # Category breadcrumb, e.g. All Games > Action Games > Counter-Strike.
        try:
            category = soup.find(name='div', attrs={'class': 'breadcrumbs'}).text
            category = category.replace('\t', '')
            category = category.replace('\n', '')
        except:
            category = ''

        # Genre.
        genre = sel.xpath('//div[@class="block_content"]/div/div/a/text()').extract_first()

        # Developer.
        developer = sel.xpath('//div[@class="block_content"]/div/div/a[2]/text()').extract_first()

        # Publisher.
        publisher = sel.xpath('//div[@class="block_content"]/div/div/a[3]/text()').extract_first()

        # Release date.
        release_date = sel.xpath('//div[@class="release_date"]/span/text()').extract_first()

        # Number of supported languages (minus the table's header row).
        language_number = len(
            sel.xpath('//table[@class="game_language_options"]/tr').extract()) - 1

        # Short description.
        description = sel.xpath('//div[@class="game_description_snippet"]/text()').extract_first()

        # Crawl time; left as None so MySQL fills in the TIMESTAMP column.
        save_time = None

        msg = (id, name, price, response.url, metacritic_score, user_reviews_count,
               positive_user_reviews_count, positive_percent, negative_user_reviews_count,
               steam_user_reviews_count, non_steam_user_reviews_count,
               english_user_reviews_count, non_english_user_reviews_count, tag_list,
               achievements_count, category, genre, developer, publisher, release_date,
               language_number, description, save_time)
        command = (
            "INSERT IGNORE INTO {} "
            "(id, name, price, url, metacritic_score, user_reviews_count, positive_user_reviews_count, "
            "positive_percent, negative_user_reviews_count, steam_user_reviews_count, "
            "non_steam_user_reviews_count, english_user_reviews_count, non_english_user_reviews_count, "
            "tag_list, achievements_count, category, genre, developer, publisher, release_date, "
            "language_number, description, save_time)"
            "VALUES(%s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, "
            "%s)".format(config.steam_game_info_table))
        self.sql.insert_data(command, msg)

        command = "UPDATE {0} SET is_crawled='yes' WHERE id='{1}'".format(
            config.steam_game_urls_table, id)
        self.sql.execute(command)

    def error_parse(self, failure):
        request = failure.request
        utils.log('error_parse url:%s meta:%s' % (request.url, request.meta))

    def get_id(self, url):
        # Steam store URLs come in four shapes (/sub/, /app/, /bundle/, other);
        # extract the numeric id accordingly.
        if '/sub/' in url:
            pattern = re.compile('/sub/(\d+)/')
            type = 'sub'
        elif '/app/' in url:
            pattern = re.compile('/app/(\d+)/', re.S)
            type = 'app'
        elif '/bundle/' in url:
            pattern = re.compile('/bundle/(\d+)/', re.S)
            type = 'bundle'
        else:
            pattern = re.compile('/(\d+)/', re.S)
            type = 'other'
            utils.log('get_id other url:%s' % url)

        id = re.search(pattern, url)
        if id:
            return id.group(1)

        # No id found: hand back a unique negative sentinel instead.
        self.error_count = self.error_count + 1
        utils.log('get_id error url:%s' % url)
        return -self.error_count

    def count_to_int(self, data):
        # Convert counts like "(12,345)" to 12345; -1 marks a missing value.
        try:
            ret = data
            ret = ret.replace('(', '')
            ret = ret.replace(')', '')
            ret = ret.replace(',', '')
            return int(ret)
        except:
            return -1

    def save_page(self, file_name, data):
        with open(file_name, 'w') as f:
            f.write(data)
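# For reference, a standalone illustration of the two parsing helpers'
# contracts; the URL and counts are made up, not taken from a crawl.
import re


def demo():
    # count_to_int strips Steam's "(12,345)" formatting and returns an int.
    assert int('(12,345)'.replace('(', '').replace(')', '').replace(',', '')) == 12345

    # get_id pulls the numeric id out of a store URL, e.g.
    # http://store.steampowered.com/app/730/ -> '730' (type 'app').
    m = re.search(re.compile('/app/(\d+)/'), 'http://store.steampowered.com/app/730/')
    assert m.group(1) == '730'


demo()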