Code Example #1
def randitem(spargs):
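    # Fetch candidate product ids from the recommendation endpoint, pick one at random,
    # then either launch a real-time crawl/analysis subprocess or replay results cached in MySQL.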
    guid = spargs.get('guid', 0)
    utils.push_redis(guid, 0, '正在随机产生商品链接', save_to_mysql=False)

    url = 'https://diviner.taobao.com/diviner?p=610009&callback=jsonpCallbackMoreGood&lid=1&uuid=122270672' \
          '.1492415671516609876050.1492415672.1492415672.1492415672.1&pin=&lim=100&ec=utf-8&_=1492415813682'
    headers = {
        'Host': 'diviner.taobao.com',
        'Referer': 'https://www.taobao.com/',
        'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10.11; rv:52.0) Gecko/20100101 Firefox/52.0',
    }
    cookies = {
        '__jda': '122270672.1492415671516609876050.1492415672.1492415672.1492415672.1',
        '__jdb': '122270672.1.1492415671516609876050|1.1492415672',
        '__jdc': '122270672',
        '__jdv': '122270672|direct|-|none|-|1492415671524',
        '__jdu': '1492415671516609876050',
    }

    r = requests.get(url=url, headers=headers, cookies=cookies, timeout=20)
    pattern = re.compile(r'"sku":(\d+),', re.S)
    ids = re.findall(pattern, r.text)
    id = random.choice(ids)

    url = 'https://item.taobao.com/%s.html' % str(id)
    utils.push_redis(guid,
                     0,
                     '生成商品链接:<a href="%s" target="_blank">%s' % (url, url),
                     save_to_mysql=False)

    sql = SqlHelper()
    command = "SELECT id FROM {table} WHERE id={product_id}". \
        format(table = config.tb_item_table, product_id = id)
    result = sql.query_one(command)

    # If the product is not in the database yet, crawl it now
    if result is None:
        cmd = 'cd {dir};python manage.py real_time_analysis -a name={name} -a guid={guid} ' \
              '-a product_id={product_id} -a url={url};'. \
            format(url = str(url), name = 'tb', dir = settings.BASE_DIR, guid = guid,
                   product_id = id)
        subprocess.Popen(cmd, shell=True)
    else:
        # If the data already exists in the database, read it directly
        command = "SELECT * FROM {0} WHERE product_id={1} ORDER BY id". \
            format(config.analysis_item_table, id)
        result = sql.query(command)
        for res in result:
            utils.push_redis(guid, res[1], res[2], res[3], save_to_mysql=False)
Code Example #2
    def GET(self):
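        # Return every row of the table named by the 'name' query parameter
        # as a JSON list of ip/port/speed entries.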
        try:
            sql = SqlHelper()

            inputs = web.input()
            name = inputs.get('name')
            command = "SELECT * FROM {0}".format(name)
            result = sql.query(command)
            data = [{
                'ip': item[1],
                'port': item[2],
                'speed': item[6]
            } for item in result]
            data = json.dumps(data, indent=4)
            return data
        except:
            pass

        return []
Code Example #3
File: dataserver.py Project: zhuozecheng/IPProxyTool
    def GET(self):
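        # Query the proxy table named by 'name', optionally filtered by anonymity and/or https,
        # ordered and limited by the 'order', 'sort' and 'count' parameters; return the rows as JSON.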
        try:
            sql = SqlHelper()
            inputs = web.input()
            name = inputs.get('name')
            anonymity = inputs.get('anonymity', None)
            https = inputs.get('https', None)
            order = inputs.get('order', 'speed')
            sort = inputs.get('sort', 'asc')
            count = inputs.get('count', 100)

            command = ''
            if anonymity is None and https is None:
                command = "SELECT * FROM {name} ORDER BY {order} {sort} LIMIT {count}". \
                    format(name = name, order = order, sort = sort, count = count)
            elif anonymity is not None and https is None:
                command = "SELECT * FROM {name} WHERE anonymity=\'{anonymity}\' ORDER BY {order} {sort} " \
                          "LIMIT {count}". \
                    format(name = name, anonymity = anonymity, order = order, sort = sort, count = count)
            elif anonymity is None and https is not None:
                command = "SELECT * FROM {name} WHERE https=\'{https}\' ORDER BY {order} {sort} LIMIT {count}". \
                    format(name = name, https = https, order = order, sort = sort, count = count)
            elif anonymity is not None and https is not None:
                command = "SELECT * FROM {name} WHERE anonymity=\'{anonymity}\' AND https=\'{https}\' ORDER BY " \
                          "{order} {sort} LIMIT {count}". \
                    format(name = name, anonymity = anonymity, https = https, order = order, sort = sort, count = count)
            result = sql.query(command)
            data = [{
                'id': item[0],
                'ip': item[1],
                'port': item[2],
                'anonymity': item[4],
                'https': item[5],
                'speed': item[6],
                'save_time': str(item[8])
            } for item in result]

            data = json.dumps(data, indent=4)
            return data
        except Exception as e:
            utils.log('select exception msg:%s' % e)
Code Example #4
    def GET(self):
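        # Like the handler above, but the ORDER BY column comes from 'sort' and the
        # JSON response contains only ip, port and speed.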
        try:
            sql = SqlHelper()

            inputs = web.input()
            name = inputs.get('name')
            anonymity = inputs.get('anonymity', None)
            https = inputs.get('https', None)
            sort = inputs.get('sort', 'speed')
            count = inputs.get('count', 100)

            command = ''
            if anonymity is None and https is None:
                command = "SELECT * FROM {0} ORDER BY {1} LIMIT {2}".format(
                    name, sort, count)
            elif anonymity is not None and https is None:
                command = "SELECT * FROM {0} WHERE anonymity=\'{1}\' ORDER BY {2} LIMIT {3}". \
                    format(name, anonymity, sort, count)
            elif anonymity is None and https is not None:
                command = "SELECT * FROM {0} WHERE https=\'{1}\' ORDER BY {2} LIMIT {3}". \
                    format(name, https, sort, count)
            elif anonymity is not None and https is not None:
                command = "SELECT * FROM {0} WHERE anonymity=\'{1}\' AND https=\'{2}\' ORDER BY {3} LIMIT {4}". \
                    format(name, anonymity, https, sort, count)

            result = sql.query(command)
            data = [{
                'ip': item[1],
                'port': item[2],
                'speed': item[6]
            } for item in result]
            data = json.dumps(data, indent=4)
            return data
        except:
            pass

        return []
Code Example #5
class RecipeDetail(CrawlSpider):
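    # Spider that visits recipe pages taken from the item list table and stores each
    # ingredient (source name and id) of the recipe into MySQL.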
    name = "recipe_detail"

    base_url = 'https://www.xiachufang.com'
    
    header = {
        'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
        'Accept-Encoding': 'gzip, deflate, sdch',
        'Accept-Language': 'zh-CN,zh;q=0.8,en;q=0.6',
        'Connection': 'keep-alive',
        'Host': 'www.xiachufang.com',
        'Upgrade-Insecure-Requests': '1',
        'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10.11; rv:51.0) Gecko/20100101 Firefox/51.0',
    }

    def __init__(self, *a , **kw):
        super(RecipeDetail, self).__init__(*a, **kw)

        self.dir_name = 'log/%s' % self.name
        self.sql = SqlHelper()
        self.init()
        utils.make_dir(self.dir_name)


    def init(self):
        command = (
            "CREATE TABLE IF NOT EXISTS {} ("
            "`id` INT(8) NOT NULL AUTO_INCREMENT,"
            "`name` CHAR(20) NOT NULL COMMENT 'recipe name',"
            "`recipe_id` INT(12) NOT NULL COMMENT 'recipe ID',"
            "`source_name` CHAR(20) NOT NULL COMMENT 'source name',"
            "`source_id` INT(8) NOT NULL COMMENT 'source ID',"
            "`create_time` DATETIME NOT NULL,"
            "PRIMARY KEY(id)"
            ") ENGINE=InnoDB".format(config.item_detail_table)
        )

        self.sql.create_table(command)

    def start_requests(self):
        command = "SELECT * from {}".format(config.item_list_table)
        data = self.sql.query(command)

        for i, recipe in enumerate(data):
            if recipe[0] > 8999 and recipe[0] < 10000:
                url = self.base_url + recipe[2]
                utils.log(url)
                yield Request(
                    url = url,
                    headers = self.header,
                    callback = self.parse_all,
                    errback = self.error_parse,
                    meta={"re_id": recipe[3], "re_name": recipe[1]},
                    dont_filter = True,
                )


    def parse_all(self, response):
        utils.log(response.url)
        if response.status == 429:
            raise CloseSpider('Too much request, IP banned')
        if response.status == 200:
            file_name = '%s/recipe.html' % (self.dir_name)
            self.save_page(file_name, response.body)
            sources = response.xpath("//div[@class='ings']//tr").extract()

            for source in sources:
                sel = Selector(text = source)
                
                source_name = sel.xpath("//a/text()").extract_first()
                url = sel.xpath("//a/@href").extract_first()
                if source_name is not None and url is not None:
                    source_id = url.split('/')[-2]
                    r_name = response.meta["re_name"]
                    r_id = response.meta["re_id"]
                    dt = datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S")
                    msg = (None, r_name, r_id, source_name, source_id, dt)
                    command = ("INSERT IGNORE INTO {} "
                                "(id, name, recipe_id, source_name, source_id, create_time)"
                                "VALUES(%s,%s,%s,%s,%s,%s)".format(config.item_detail_table)
                    )
                    self.sql.insert_data(command, msg)


    def error_parse(self, failure):
        request = failure.request
        utils.log('error_parse url:%s meta:%s' % (request.url, request.meta))


    def save_page(self, file_name, data):
        with open(file_name, 'w') as f:
            f.write(data)
Code Example #6
File: views.py Project: ztg498/jd_analysis
def runspider(request):
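    # Django view: receive a jd.com item URL; if the product has not been analyzed yet, spawn a
    # real-time crawl/analysis subprocess, otherwise replay the stored results (or re-analyze when force is set).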
    data = {
        'status': 'failure',
        'guid': '0',
        'info': '',
    }

    try:
        # The production environment uses a POST request
        url = request.POST.get('url')
        force = request.POST.get('force', 'false')
        pattern = re.compile(r'\d+', re.S)
        product_id = re.search(pattern, url).group()
        sql = SqlHelper()

        utils.log('product_id:%s' % product_id)

        if 'item.jd.com' in url and product_id is not None:
            data['status'] = 'success'
            data['guid'] = str(uuid.uuid4())
            data['info'] = '成功接收数据,正在为您抓取并分析数据,精彩稍候呈现'

            command = "SELECT id FROM {table} WHERE id={product_id}". \
                format(table = config.jd_item_table, product_id = product_id)
            result = sql.query_one(command)

            if result is None:
                name = 'jd'
                cmd = 'cd {dir};python manage.py real_time_analysis -a name={name} -a guid={guid} ' \
                      '-a product_id={product_id} -a url={url};'. \
                    format(url = str(url), name = name, dir = settings.BASE_DIR, guid = data.get('guid'),
                           product_id = product_id)

                subprocess.Popen(cmd, shell=True)
            else:
                if force == 'false':
                    utils.log('数据库中存在数据,从数据库中取出分析结果')
                    command = "SELECT * FROM {0} WHERE product_id={1} ORDER BY id". \
                        format(config.analysis_item_table, product_id)
                    result = sql.query(command)
                    for res in result:
                        utils.push_redis(data.get('guid'),
                                         res[1],
                                         res[2],
                                         res[3],
                                         save_to_mysql=False)
                else:
                    command = "DELETE FROM {0} WHERE product_id={1}".format(
                        config.analysis_item_table, product_id)
                    sql.execute(command)
                    # Re-analyze the data
                    cmd = 'cd {dir};python manage.py analysis -a url={url} -a name={name} -a guid={guid} -a ' \
                          'product_id={product_id};'. \
                        format(url = url, name = 'jd', dir = settings.BASE_DIR, guid = data.get('guid'),
                               product_id = product_id)

                    subprocess.Popen(cmd, shell=True)
        else:
            data['info'] = '传入网址有误,请检查后重新输入,请输入以下格式的网址:\n%s' % 'https://item.jd.com/3995645.html'
    except Exception as e:
        logging.error('run spider exception:%s' % e)
        data['info'] = '出现错误,错误原因:%s' % e
Code Example #7
File: item.py Project: sam408130/base_spider
class ItemDetail(CrawlSpider):
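    # Spider that walks the xiachufang category pages, follows the "next" pagination link,
    # and stores each recipe's name, url, cover image, id, ingredients and score into MySQL.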
    name = "item_list"
    base_url = "http://www.xiachufang.com"
    header = {
        'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
        'Accept-Encoding': 'gzip, deflate, sdch',
        'Accept-Language': 'zh-CN,zh;q=0.8,en;q=0.6',
        'Connection': 'keep-alive',
        'Host': 'www.xiachufang.com',
        'Upgrade-Insecure-Requests': '1',
        'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10.11; rv:51.0) Gecko/20100101 Firefox/51.0',
    }

    def __init__(self, *a, **kw):
        super(ItemDetail, self).__init__(*a, **kw)

        self.dir_name = 'log/%s' % self.name
        self.sql = SqlHelper()
        self.init()
        utils.make_dir(self.dir_name)

    def init(self):
        command = ("CREATE TABLE IF NOT EXISTS {} ("
                   "`id` INT(8) NOT NULL AUTO_INCREMENT,"
                   "`name` CHAR(20) NOT NULL COMMENT 'dish name',"
                   "`url` TEXT NOT NULL COMMENT 'dish url',"
                   "`img` TEXT NOT NULL COMMENT 'cover image',"
                   "`item_id` INT(8) NOT NULL COMMENT 'dish id',"
                   "`source` TEXT NOT NULL COMMENT 'ingredients',"
                   "`score` CHAR(5) NOT NULL COMMENT 'score',"
                   "`create_time` DATETIME NOT NULL,"
                   "PRIMARY KEY(id),"
                   "UNIQUE KEY `item_id` (`item_id`)"
                   ") ENGINE=InnoDB".format(config.item_list_table))

        self.sql.create_table(command)

    def start_requests(self):
        command = "SELECT * from {}".format(config.category_urls_table)
        data = self.sql.query(command)

        for i, category in enumerate(data):
            url = self.base_url + category[2]
            utils.log(url)
            yield Request(
                url=url,
                headers=self.header,
                callback=self.parse_all,
                errback=self.error_parse,
            )

    def parse_all(self, response):
        utils.log(response.url)
        if response.status == 200:
            file_name = '%s/category.html' % (self.dir_name)
            self.save_page(file_name, response.body)
            recipes = response.xpath(
                "//div[@class='normal-recipe-list']/ul/li").extract()
            self.parse_recipes(recipes)
            nextPage = response.xpath(
                "//div[@class='pager']/a[@class='next']/@href").extract_first(
                )
            if nextPage:
                yield Request(
                    url=self.base_url + nextPage,
                    headers=self.header,
                    callback=self.parse_all,
                    errback=self.error_parse,
                )

    def parse_recipes(self, recipes):
        for recipe in recipes:
            sel = Selector(text=recipe)
            name = sel.xpath(
                "//p[@class='name']/text()").extract_first().strip()
            url = sel.xpath("//a[1]/@href").extract_first()
            img = sel.xpath(
                "//div[@class='cover pure-u']/img/@data-src").extract_first()
            item_id = re.compile("/recipe/(.*?)/").findall(url)[0]
            source = sel.xpath(
                "//p[@class='ing ellipsis']/text()").extract_first().strip()
            score = sel.xpath(
                "//p[@class='stats']/span/text()").extract_first().strip()
            dt = datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S")
            msg = (None, name, url, img, item_id, source, score, dt)
            command = (
                "INSERT IGNORE INTO {} "
                "(id, name, url, img, item_id, source, score, create_time)"
                "VALUES(%s,%s,%s,%s,%s,%s,%s,%s)".format(
                    config.item_list_table))
            self.sql.insert_data(command, msg)

    def error_parse(self, failure):
        request = failure.request
        utils.log('error_parse url:%s meta:%s' % (request.url, request.meta))

    def save_page(self, file_name, data):
        with open(file_name, 'w') as f:
            f.write(data)
Code Example #8
File: views.py Project: tyzctyzc/tb_analysis
def runspider(request):
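    # View function: receive a rate.taobao.com user-rate URL and extract the user id; spawn a
    # crawl/analysis subprocess for new users, otherwise replay stored results (or re-analyze when force is set).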
    data = {
        'status': 'failure',
        'guid': '0',
        'info': '',
    }

    try:
        # The production environment uses a POST request
        url = request.POST.get('url')
        force = request.POST.get('force', 'false')
        pattern = re.compile('user-rate-')
        urls = re.split(pattern, url)
        user_id = urls[1]
        pattern = re.compile(r'\w+', re.S)
        user_id = re.search(pattern, user_id).group()
        sql = SqlHelper()

        utils.log('user_id:%s' % user_id)

        if 'rate.taobao.com' in url and user_id is not None:
            data['status'] = 'success'
            data['guid'] = str(random.randint(1000000000000,
                                              9999999999999)) + '_' + str(
                                                  random.randint(100, 999))
            data['info'] = '成功接收数据,正在为您抓取并分析数据,精彩稍候呈现'

            command = "SELECT id FROM {table} WHERE id={user_id}". \
                format(table = config.tb_item_table, user_id = user_id)
            result = sql.query_one(command)

            if result is None:
                name = 'tb_comment'
                cmd = 'python manage.py real_time_analysis -a name={name} -a guid={guid} ' \
                      '-a user_id={user_id} -a url={url};'. \
                    format(url = str(url), name = name, dir = settings.BASE_DIR, guid = data.get('guid'),
                           user_id = user_id)

                logging.warn(cmd)
                subprocess.Popen(cmd, shell=True)
            else:
                if force == 'false':
                    utils.log('数据库中存在数据,从数据库中取出分析结果')
                    command = "SELECT * FROM {0} WHERE user_id={1} ORDER BY id". \
                        format(config.analysis_item_table, user_id)
                    result = sql.query(command)
                    for res in result:
                        utils.push_redis(data.get('guid'),
                                         res[1],
                                         res[2],
                                         res[3],
                                         save_to_mysql=False)
                else:
                    command = "DELETE FROM {0} WHERE user_id={1}".format(
                        config.analysis_item_table, user_id)
                    sql.execute(command)
                    # Re-analyze the data
                    cmd = 'cd {dir};python manage.py analysis -a url={url} -a name={name} -a guid={guid} -a ' \
                          'user_id={user_id};'. \
                        format(url = url, name = 'tb', dir = settings.BASE_DIR, guid = data.get('guid'),
                               user_id = user_id)

                    subprocess.Popen(cmd, shell=True)
        else:
            data['info'] = '传入网址有误,请检查后重新输入,请输入以下格式的网址:\n%s' % 'https://rate.taobao.com/user-rate-UvGv0MFc0vFILvgTT.htm'
    except Exception as e:
        logging.error('run spider exception:%s' % e)
        data['info'] = '出现错误,错误原因:%s' % e
Code Example #9
File: recipe.py Project: IamSpicyBar/spiderRecipes
class Recipe(CrawlSpider):
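    # Spider that crawls the first two "created" recipe pages of each user url in the users
    # table (for rows with index > 200) and stores each recipe's name, url and ids into MySQL.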
    name = "user_recipes"

    base_url = 'https://www.xiachufang.com'
    
    header = {
        'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
        'Accept-Encoding': 'gzip, deflate, sdch',
        'Accept-Language': 'zh-CN,zh;q=0.8,en;q=0.6',
        'Connection': 'keep-alive',
        'Host': 'www.xiachufang.com',
        'Upgrade-Insecure-Requests': '1',
        'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10.11; rv:51.0) Gecko/20100101 Firefox/51.0',
    }

    def __init__(self, *a , **kw):
        super(Recipe, self).__init__(*a, **kw)

        self.dir_name = 'log/%s' % self.name
        self.sql = SqlHelper()
        self.init()
        utils.make_dir(self.dir_name)

    def init(self):
        command = (
            "CREATE TABLE IF NOT EXISTS {} ("
            "`id` INT(8) NOT NULL AUTO_INCREMENT,"
            "`name` CHAR(20) NOT NULL COMMENT 'recipe name',"
            "`url` TEXT NOT NULL COMMENT 'recipe url',"
            "`item_id` INT(8) NOT NULL COMMENT 'recipe ID',"
            "`user_id` INT(12) NOT NULL COMMENT 'user ID',"
            "`create_time` DATETIME NOT NULL,"
            "PRIMARY KEY(id),"
            "UNIQUE KEY `item_id` (`item_id`)"
            ") ENGINE=InnoDB".format(config.item_list_table)
        )

        self.sql.create_table(command)

    def start_requests(self):
        command = "SELECT * from {}".format(config.users_urls_table)
        data = self.sql.query(command)

        for i, user in enumerate(data):
            if i > 200:
                page = 1
                url = self.base_url + user[3] + 'created/?page=%d' % page
                utils.log(url)
                yield Request(
                    url = url,
                    headers = self.header,
                    meta = {"page":page, "user_id":user[2], "user_url":user[3]},
                    callback = self.parse_all,
                    errback = self.error_parse,
                )

    def parse_all(self, response):
        utils.log(response.url)
        if response.status == 200:
            file_name = '%s/user.html' % (self.dir_name)
            self.save_page(file_name, response.body)
            recipes = response.xpath("//div[@class='recipes-280-full-width-list']/ul/li").extract()

            page =  response.meta["page"]
            u_url = response.meta["user_url"]
            u_id = response.meta["user_id"]
            
            for recipe in recipes:
                sel = Selector(text = recipe)
                
                name = sel.xpath("//p[@class='name ellipsis red-font']/a/text()").extract_first().strip()
                url = sel.xpath("//p[@class='name ellipsis red-font']/a/@href").extract_first()
                item_id = url.split('/')[-2]
                dt = datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S")
                msg = (None, name, url, item_id, u_id, dt)
                command = ("INSERT IGNORE INTO {} "
                            "(id, name, url, item_id, user_id, create_time)"
                            "VALUES(%s,%s,%s,%s,%s,%s)".format(config.item_list_table)
                )
                self.sql.insert_data(command, msg)
                
            page += 1
            if page < 3:
                
                yield Request(
                    url = self.base_url + u_url + 'created/?page=%d' % page,
                    meta = {"page":page, "user_id":u_id, "user_url":u_url},
                    headers = self.header,
                    callback = self.parse_all,
                    errback = self.error_parse,
                    dont_filter = True,
                )   

    def error_parse(self, failure):
        request = failure.request
        utils.log('error_parse url:%s meta:%s' % (request.url, request.meta))


    def save_page(self, file_name, data):
        with open(file_name, 'w') as f:
            f.write(data)
Code Example #10
File: getdata.py Project: IamSpicyBar/spiderRecipes
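    # Excerpt: dump MySQL query results into CSV files, namely (user_id, item_id) pairs and
    # (recipe_id, recipe name) pairs; csvFile1 and csvFile2 are opened earlier in the source file.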
    csvFile3 = open('recipe_name.csv','w')
    csvFile4 = open('source_name.csv','w')
    
    writer1 = csv.writer(csvFile1)
    writer1.writerow(['user_id', 'item_id'])
    writer2 = csv.writer(csvFile2)
    writer2.writerow(['recipe_id', 'source_id'])
    writer3 = csv.writer(csvFile3)
    writer3.writerow(['recipe_id', 'recipe_name'])
    writer4 = csv.writer(csvFile4)
    writer4.writerow(['source_id', 'source_name'])
    
    sql = SqlHelper()

    command = "SELECT user_id, item_id FROM {}".format(config.item_list_table)
    data = sql.query(command)

    for i, recipe in enumerate(data):
        writer1.writerow([recipe[0], recipe[1]])

    csvFile1.close()
        
    command = "SELECT item_id, name FROM {}".format(config.item_list_table)
    data1 = sql.query(command)

    for i, item in enumerate(data1):
 #       name_en = translator.translate(item[1]).text
        writer3.writerow([item[0], item[1].encode('utf8')])
    
    csvFile3.close()
  
Code Example #11
class GameInfo(CrawlSpider):
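    # Spider that fetches the Steam store page of every uncrawled 'app' url, passes the age gate
    # when needed, parses price, review counts, tags and other metadata, stores one row per game,
    # and finally marks the url as crawled.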
    name = 'game_info'

    def __init__(self, *a, **kw):
        super(GameInfo, self).__init__(*a, **kw)

        self.dir_game = 'log/%s' % self.name
        self.sql = SqlHelper()
        self.init()

        utils.make_dir(self.dir_game)

        self.error_count = 0

    def init(self):
        command = ("CREATE TABLE IF NOT EXISTS {} ("
                   "`id` INT(8) NOT NULL AUTO_INCREMENT,"
                   "`name` TEXT NOT NULL,"
                   "`price` INT(5) NOT NULL,"
                   "`metacritic_score` FLOAT DEFAULT NULL,"
                   "`user_reviews_count` INT(6) NOT NULL,"
                   "`positive_user_reviews_count` INT(6) NOT NULL,"
                   "`positive_percent` FLOAT NOT NULL ,"
                   "`negative_user_reviews_count` INT(6) NOT NULL,"
                   '`steam_user_reviews_count` INT(6) NOT NULL,'
                   '`non_steam_user_reviews_count` INT(6) NOT NULL,'
                   '`english_user_reviews_count` INT(6) NOT NULL,'
                   '`non_english_user_reviews_count` INT(6) NOT NULL,'
                   "`tag_list` TEXT DEFAULT NULL,"
                   "`achievements_count` INT(4) DEFAULT NULL,"
                   "`category` TEXT NOT NULL,"
                   "`genre` TEXT NOT NULL,"
                   "`developer` TEXT NOT NULL,"
                   "`publisher` TEXT NOT NULL,"
                   "`release_date` TEXT NOT NULL,"
                   "`url` TEXT NOT NULL,"
                   "`language_number` INT(3) DEFAULT NULL,"
                   "`description` TEXT DEFAULT NULL,"
                   "`save_time` TIMESTAMP NOT NULL,"
                   "PRIMARY KEY(id)"
                   ") ENGINE=InnoDB".format(config.steam_game_info_table))
        self.sql.create_table(command)

    def start_requests(self):
        command = "SELECT * FROM {} WHERE is_crawled = \'no\' AND type = \'app\'".format(
            config.steam_game_urls_table)
        data = self.sql.query(command)
        for i, item in enumerate(data):
            yield Request(
                url=item[3],
                dont_filter=True,
                method='GET',
                headers={
                    'Accept':
                    'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
                    'Accept-Encoding':
                    'gzip, deflate',
                    'Accept-Language':
                    'en-US,en;q=0.5',
                    'Connection':
                    'keep-alive',
                    'Host':
                    'store.steampowered.com',
                    'Upgrade-Insecure-Requests':
                    '1',
                    'User-Agent':
                    'Mozilla/5.0 (Macintosh; Intel Mac OS X 10.11; rv:51.0) Gecko/20100101 '
                    'Firefox/51.0',
                },
                meta={
                    'item': item,
                    'id': item[0],
                },
                cookies={
                    'mature_content': '1',
                },
                callback=self.parse_game,
                errback=self.error_parse,
            )

    def parse_game(self, response):
        self.log('parse_game url:%s' % response.url)
        id = response.meta.get('id')

        # file_name = '%s/%s.html' % (self.dir_game, id)
        # self.save_page(file_name, response.body)

        if u'Please enter your birth date to continue' in response.body:
            self.log('Please enter your birth date to continue meta:%s' %
                     response.meta)

            url = 'http://store.steampowered.com/agecheck/app/%s/' % str(id)
            return FormRequest(url=url,
                               dont_filter=True,
                               method='POST',
                               formdata={
                                   # pick a single plausible day/year for the age check
                                   # (assumes `random` is imported in this module)
                                   'ageDay': str(random.randint(1, 24)),
                                   'ageMonth': 'January',
                                   'ageYear': str(random.randint(1980, 1994)),
                                   'snr': '1_agecheck_agecheck__age-gate',
                               },
                               callback=self.parse_game)

        soup = BeautifulSoup(response.body, 'lxml')
        sel = Selector(text=response.body)

        name = sel.xpath(
            '//div[@class="apphub_AppName"]/text()').extract_first()
        if name == '' or name is None:
            self.log('no get data meta:%s' % response.meta)
            return

        price = sel.xpath('//div[@class="game_purchase_price price"]/text()'
                          ).extract_first()
        try:
            p = price.split('¥')
            price = int(p[1])
        except:
            price = -1

        # The game's score on Metacritic
        metacritic_score = sel.xpath(
            '//div[@class="score high"]/text()').extract_first()
        try:
            metacritic_score = int(metacritic_score)
        except:
            metacritic_score = -1

        # Total number of user reviews
        user_reviews_count = sel.xpath(
            '//label[@for="review_type_all"]/span/text()').extract_first()
        user_reviews_count = self.count_to_int(user_reviews_count)

        # Number of positive user reviews
        positive_user_reviews_count = sel.xpath(
            '//label[@for="review_type_positive"]/span/text()').extract_first(
            )
        positive_user_reviews_count = self.count_to_int(
            positive_user_reviews_count)

        # Percentage of positive reviews
        if user_reviews_count != -1 and positive_user_reviews_count != -1:
            positive_percent = positive_user_reviews_count * 1.0 / user_reviews_count * 100
        else:
            positive_percent = 0

        # Number of negative user reviews
        negative_user_reviews_count = sel.xpath(
            '//label[@for="review_type_negative"]/span/text()').extract_first(
            )
        negative_user_reviews_count = self.count_to_int(
            negative_user_reviews_count)

        # Number of reviews from users who bought the game on Steam
        steam_user_reviews_count = sel.xpath(
            '//label[@for="purchase_type_steam"]/span/text()').extract_first()
        steam_user_reviews_count = self.count_to_int(steam_user_reviews_count)

        # Number of reviews from users who bought the game on other platforms
        non_steam_user_reviews_count = sel.xpath(
            '//label[@for="purchase_type_non_steam"]/span/text()'
        ).extract_first()
        non_steam_user_reviews_count = self.count_to_int(
            non_steam_user_reviews_count)

        # Number of reviews written in English
        english_user_reviews_count = sel.xpath(
            '//label[@for="review_language_mine"]/span/text()').extract_first(
            )
        english_user_reviews_count = self.count_to_int(
            english_user_reviews_count)

        # Number of reviews not written in English
        non_english_user_reviews_count = user_reviews_count - english_user_reviews_count

        # The game's list of tags
        try:
            tags = soup.find(attrs={'class': 'glance_tags popular_tags'})
            tag_list = tags.text.replace('\t', '')
            tag_list = tag_list.replace('\n', ',')
        except:
            tag_list = ''

        # Number of achievements the game offers
        achievements = sel.xpath(
            '//div[@id="achievement_block"]/div/text()').extract_first()
        try:
            achievements_count = re.search(r'\d+', achievements, re.S).group(0)
            achievements_count = int(achievements_count)
        except:
            achievements_count = 0

        # The game's category, e.g. All Games > Action Games > Counter-Strike
        try:
            category = soup.find(name='div', attrs={
                'class': 'breadcrumbs'
            }).text
            category = category.replace('\t', '')
            category = category.replace('\n', '')
        except:
            category = ''

        # Game genre
        genre = sel.xpath(
            '//div[@class="block_content"]/div/div/a/text()').extract_first()

        # Game developer
        developer = sel.xpath(
            '//div[@class="block_content"]/div/div/a[2]/text()').extract_first(
            )

        # Game publisher
        publisher = sel.xpath(
            '//div[@class="block_content"]/div/div/a[3]/text()').extract_first(
            )

        # Game release date
        release_date = sel.xpath(
            '//div[@class="release_date"]/span/text()').extract_first()

        # Number of languages the game supports
        language_number = len(
            sel.xpath(
                '//table[@class="game_language_options"]/tr').extract()) - 1

        # Game description
        description = sel.xpath(
            '//div[@class="game_description_snippet"]/text()').extract_first()

        # Time at which the game was crawled
        save_time = None

        msg = (id, name, price, response.url, metacritic_score,
               user_reviews_count, positive_user_reviews_count,
               positive_percent, negative_user_reviews_count,
               steam_user_reviews_count, non_steam_user_reviews_count,
               english_user_reviews_count, non_english_user_reviews_count,
               tag_list, achievements_count, category, genre, developer,
               publisher, release_date, language_number, description,
               save_time)

        command = (
            "INSERT IGNORE INTO {} "
            "(id, name, price, url, metacritic_score, user_reviews_count, positive_user_reviews_count, "
            "positive_percent, negative_user_reviews_count, steam_user_reviews_count, "
            "non_steam_user_reviews_count, english_user_reviews_count, non_english_user_reviews_count, "
            "tag_list, achievements_count, category, genre, developer, publisher, release_date, "
            "language_number, description, save_time)"
            "VALUES(%s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, "
            "%s)".format(config.steam_game_info_table))

        self.sql.insert_data(command, msg)

        command = "UPDATE {0} SET is_crawled=\'yes\' WHERE id=\'{1}\'".format(
            config.steam_game_urls_table, id)
        self.sql.execute(command)

    def error_parse(self, failure):
        request = failure.request
        utils.log('error_parse url:%s meta:%s' % (request.url, request.meta))

    def get_id(self, url):
        type = ''
        if '/sub/' in url:
            pattern = re.compile(r'/sub/(\d+)/')
            type = 'sub'
        elif '/app/' in url:
            pattern = re.compile(r'/app/(\d+)/', re.S)
            type = 'app'
        elif '/bundle/' in url:
            pattern = re.compile(r'/bundle/(\d+)/', re.S)
            type = 'bundle'
        else:
            pattern = re.compile(r'/(\d+)/', re.S)
            type = 'other'
            utils.log('get_id other url:%s' % url)

        id = re.search(pattern, url)
        if id:
            id = id.group(1)
            return id

        self.error_count = self.error_count + 1
        utils.log('get_id error url:%s' % url)
        return -self.error_count

    def count_to_int(self, data):
        try:
            ret = data
            ret = ret.replace('(', '')
            ret = ret.replace(')', '')
            ret = ret.replace(',', '')

            return int(ret)
        except:
            return -1

    def save_page(self, file_name, data):
        with open(file_name, 'w') as f:
            f.write(data)