Example #1
class GameUrls(Spider):
    name = 'game_urls'

    start_urls = [
        'http://store.steampowered.com/search/?sort_by=Released_DESC&page=%s' %
        n for n in range(1, 1058)
    ]

    def __init__(self, *a, **kw):
        super(GameUrls, self).__init__(*a, **kw)

        self.dir_game = 'log/%s' % self.name
        self.sql = SqlHelper()
        self.init()

        utils.make_dir(self.dir_game)

    def init(self):
        command = ("CREATE TABLE IF NOT EXISTS {} ("
                   "`id` INT(8) NOT NULL AUTO_INCREMENT,"
                   "`type` CHAR(10) NOT NULL,"
                   "`name` TEXT NOT NULL,"
                   "`url` TEXT NOT NULL,"
                   "`is_crawled` CHAR(5) DEFAULT 'no',"
                   "`page` INT(5) NOT NULL ,"
                   "PRIMARY KEY(id)"
                   ") ENGINE=InnoDB".format(config.steam_game_urls_table))
        self.sql.create_table(command)

    def start_requests(self):
        for i, url in enumerate(self.start_urls):
            yield Request(
                url=url,
                headers={
                    'Accept':
                    'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
                    'Accept-Encoding':
                    'gzip, deflate',
                    'Accept-Language':
                    'en-US,en;q=0.5',
                    'Connection':
                    'keep-alive',
                    'Host':
                    'store.steampowered.com',
                    'Upgrade-Insecure-Requests':
                    '1',
                    'User-Agent':
                    'Mozilla/5.0 (Macintosh; Intel Mac OS X 10.11; rv:51.0) Gecko/20100101 '
                    'Firefox/51.0',
                },
                meta={
                    'url': url,
                    'page': i + 1,
                },
                dont_filter=True,
                callback=self.parse_all,
                errback=self.error_parse,
            )

    def parse_all(self, response):
        # file_name = '%s/%s.html' % (self.dir_game, response.meta.get('page'))
        # self.save_page(file_name, response.body)

        self.log('parse_all url:%s' % response.url)

        game_list = response.xpath(
            '//div[@id="search_result_container"]/div[2]/a').extract()
        count = 0
        for game in game_list:
            sel = Selector(text=game)
            url = sel.xpath('//@href').extract_first()

            id, type = self.get_id(url)
            # id = sel.xpath('//@data-ds-appid').extract_first()
            name = sel.xpath(
                '//div[@class="col search_name ellipsis"]/span/text()'
            ).extract_first()

            msg = (None, type, name, url, 'no', response.meta.get('page'))
            command = ("INSERT IGNORE INTO {} "
                       "(id, type, name, url, is_crawled, page)"
                       "VALUES(%s, %s, %s, %s, %s, %s)".format(
                           config.steam_game_urls_table))

            self.sql.insert_data(command, msg)

    def error_parse(self, failure):
        request = failure.request
        utils.log('error_parse url:%s meta:%s' % (request.url, request.meta))

    def get_id(self, url):
        type = ''
        if '/sub/' in url:
            pattern = re.compile(r'/sub/(\d+)/')
            type = 'sub'
        elif '/app/' in url:
            pattern = re.compile(r'/app/(\d+)/', re.S)
            type = 'app'
        elif '/bundle/' in url:
            pattern = re.compile(r'/bundle/(\d+)/', re.S)
            type = 'bundle'
        else:
            pattern = re.compile(r'/(\d+)/', re.S)
            type = 'other'
            utils.log('get_id other url:%s' % url)

        id = re.search(pattern, url)
        if id:
            id = id.group(1)
            return id, type

        utils.log('get_id error url:%s' % url)
        return 0, 'error'

    def save_page(self, file_name, data):
        with open(file_name, 'w') as f:
            f.write(data)
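All of the examples on this page lean on a project-local `SqlHelper` (together with `utils` and `config`) whose source is not shown here. The following is a minimal sketch of the interface the spiders appear to assume, built on pymysql; the connection settings and method bodies are placeholders, not the project's actual implementation.

# sqlhelper.py -- minimal sketch of the helper interface these spiders assume.
# The connection settings below are placeholders; the real project keeps them
# in its config module.
import pymysql


class SqlHelper(object):
    def __init__(self):
        self.conn = pymysql.connect(host='localhost', user='root',
                                    password='', db='test', charset='utf8mb4')
        self.cursor = self.conn.cursor()

    def create_table(self, command):
        self.cursor.execute(command)
        self.conn.commit()

    def insert_data(self, command, data, commit=False):
        # Parameterized insert: `command` holds %s placeholders, `data` the values.
        self.cursor.execute(command, data)
        if commit:
            self.conn.commit()

    def query(self, command):
        self.cursor.execute(command)
        return self.cursor.fetchall()

    def execute(self, command):
        self.cursor.execute(command)
        self.conn.commit()

    def commit(self):
        self.conn.commit()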
Example #2
class Validator(Spider):
    name = 'base'
    concurrent_requests = 16
    retry_enabled = False

    def __init__(self, name=None, **kwargs):
        super(Validator, self).__init__(name, **kwargs)
        self.sql = SqlHelper()

        self.dir_log = 'log/validator/%s' % self.name
        self.timeout = 10

        self.urls = []
        self.headers = None
        self.success_mark = ''
        self.is_record_web_page = False

    def init(self):
        utils.make_dir(self.dir_log)

        command = utils.get_create_table_command(self.name)
        self.sql.create_table(command)

    @classmethod
    def update_settings(cls, settings):
        settings.setdict(cls.custom_settings or {
            'CONCURRENT_REQUESTS': cls.concurrent_requests,
            'RETRY_ENABLED': cls.retry_enabled,
        },
                         priority='spider')

    def start_requests(self):
        count = utils.get_table_length(self.sql, self.name)
        count_free = utils.get_table_length(self.sql, config.httpbin_table)

        ids = utils.get_table_ids(self.sql, self.name)
        ids_free = utils.get_table_ids(self.sql, config.httpbin_table)

        for i in range(0, count + count_free):
            table = self.name if (i < count) else config.httpbin_table
            id = ids[i] if i < count else ids_free[i - len(ids)]

            proxy = utils.get_proxy_info(self.sql, table, id)
            if proxy is None:
                continue

            for url in self.urls:
                cur_time = time.time()
                yield Request(
                    url=url,
                    headers=self.headers,
                    meta={
                        'cur_time':
                        cur_time,
                        'download_timeout':
                        self.timeout,
                        'proxy_info':
                        proxy,
                        'table':
                        table,
                        'id':
                        proxy.get('id'),
                        'proxy':
                        'http://%s:%s' % (proxy.get('ip'), proxy.get('port')),
                        'vali_count':
                        proxy.get('vali_count', 0)
                    },
                    dont_filter=True,
                    callback=self.success_parse,
                    errback=self.error_parse,
                )

    def success_parse(self, response):
        utils.log('success_parse speed:%s meta:%s' %
                  (time.time() - response.meta.get('cur_time'), response.meta))

        proxy = response.meta.get('proxy_info')
        table = response.meta.get('table')
        id = response.meta.get('id')
        ip = proxy.get('ip')

        self.save_page(ip, response.body)

        if self.success_mark == '' or self.success_mark in response.body:
            speed = time.time() - response.meta.get('cur_time')
            if table == self.name:
                if speed > self.timeout:
                    command = utils.get_delete_data_command(table, id)
                    self.sql.execute(command)
                else:
                    vali_count = response.meta.get('vali_count', 0) + 1
                    command = utils.get_update_data_command(
                        table, id, speed, vali_count)
                    self.sql.execute(command)
            else:
                if speed < self.timeout:
                    command = utils.get_insert_data_command(self.name)
                    msg = (None, proxy.get('ip'), proxy.get('port'),
                           proxy.get('country'), proxy.get('anonymity'),
                           proxy.get('https'), speed, proxy.get('source'),
                           None, 1)

                    self.sql.insert_data(command, msg, commit=True)
        else:
            # If the success mark is missing, the response is invalid, so delete this IP from the current table
            if table == self.name:
                command = utils.get_delete_data_command(table, id)
                self.sql.execute(command)

    def error_parse(self, failure):
        request = failure.request
        utils.log('error_parse value:%s url:%s meta:%s' %
                  (failure.value, request.url, request.meta))

        proxy = failure.request.meta.get('proxy_info')
        table = failure.request.meta.get('table')
        id = failure.request.meta.get('id')

        if table == self.name:
            command = utils.get_delete_data_command(table, id)
            self.sql.execute(command)
        else:
            # TODO: when proxy validation fails, handle each specific error type separately
            pass

            #
            # request = failure.request.meta
            # utils.log('request meta:%s' % str(request))
            #
            # # log all errback failures,
            # # in case you want to do something special for some errors,
            # # you may need the failure's type
            # self.logger.error(repr(failure))
            #
            # #if isinstance(failure.value, HttpError):
            # if failure.check(HttpError):
            #     # you can get the response
            #     response = failure.value.response
            #     self.logger.error('HttpError on %s', response.url)
            #
            # #elif isinstance(failure.value, DNSLookupError):
            # elif failure.check(DNSLookupError):
            #     # this is the original request
            #     request = failure.request
            #     self.logger.error('DNSLookupError on %s', request.url)
            #
            # #elif isinstance(failure.value, TimeoutError):
            # elif failure.check(TimeoutError):
            #     request = failure.request
            #     self.logger.error('TimeoutError on url:%s', request.url)

    def save_page(self, ip, data):
        filename = '{time} {ip}'.format(
            time=datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S:%f'),
            ip=ip)
        utils.log('filename:%s' % filename)

        if self.is_record_web_page:
            with open('%s/%s.html' % (self.dir_log, filename), 'w') as f:
                f.write(data)

    def close(spider, reason):
        spider.sql.commit()
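The validator never writes SQL directly; it asks `utils` for command strings such as `get_delete_data_command` and `get_update_data_command`. Their real definitions are not shown on this page, so the sketch below only illustrates plausible shapes that match how they are called above; the column names (speed, vali_count, save_time, ...) are assumptions.

# Hypothetical sketches of the SQL helpers used above; the real helpers live in
# the project's utils module and the column names are assumptions.
def get_delete_data_command(table, id):
    return "DELETE FROM {0} WHERE id = {1}".format(table, id)


def get_update_data_command(table, id, speed, vali_count=1):
    return ("UPDATE {0} SET speed = {1}, vali_count = {2} "
            "WHERE id = {3}".format(table, speed, vali_count, id))


def get_insert_data_command(table):
    # Matches the ten-element msg tuple built in success_parse.
    return ("INSERT IGNORE INTO {0} "
            "(id, ip, port, country, anonymity, https, speed, source, "
            "save_time, vali_count) "
            "VALUES(%s, %s, %s, %s, %s, %s, %s, %s, %s, %s)".format(table))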
Example #3
class RecipeDetail(CrawlSpider):
    name = "recipe_detail"

    base_url = 'https://www.xiachufang.com'
    
    header = {
        'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
        'Accept-Encoding': 'gzip, deflate, sdch',
        'Accept-Language': 'zh-CN,zh;q=0.8,en;q=0.6',
        'Connection': 'keep-alive',
        'Host': 'www.xiachufang.com',
        'Upgrade-Insecure-Requests': '1',
        'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10.11; rv:51.0) Gecko/20100101 Firefox/51.0',
    }

    

    def __init__(self, *a, **kw):
        super(RecipeDetail, self).__init__(*a, **kw)

        self.dir_name = 'log/%s' % self.name
        self.sql = SqlHelper()
        self.init()
        utils.make_dir(self.dir_name)


    def init(self):
        command = (
            "CREATE TABLE IF NOT EXISTS {} ("
            "`id` INT(8) NOT NULL AUTO_INCREMENT,"
            "`name` CHAR(20) NOT NULL COMMENT 'recipe name',"
            "`recipe_id` INT(12) NOT NULL COMMENT 'recipe ID',"
            "`source_name` CHAR(20) NOT NULL COMMENT 'source name',"
            "`source_id` INT(8) NOT NULL COMMENT 'source ID',"
            "`create_time` DATETIME NOT NULL,"
            "PRIMARY KEY(id)"
            ") ENGINE=InnoDB".format(config.item_detail_table)
        )

        self.sql.create_table(command)

    def start_requests(self):
        command = "SELECT * from {}".format(config.item_list_table)
        data = self.sql.query(command)

        for i, recipe in enumerate(data):
            if recipe[0] > 8999 and recipe[0] < 10000:
                url = self.base_url + recipe[2]
                utils.log(url)
                yield Request(
                    url = url,
                    headers = self.header,
                    callback = self.parse_all,
                    errback = self.error_parse,
                    meta={"re_id": recipe[3], "re_name": recipe[1]},
                    dont_filter = True,
                )


    def parse_all(self, response):
        utils.log(response.url)
        if response.status == 429:
            raise CloseSpider('Too many requests, IP banned')
        if response.status == 200:
            file_name = '%s/recipe.html' % (self.dir_name)
            self.save_page(file_name, response.body)
            sources = response.xpath("//div[@class='ings']//tr").extract()

            for source in sources:
                sel = Selector(text = source)
                
                source_name = sel.xpath("//a/text()").extract_first()
                url = sel.xpath("//a/@href").extract_first()
                if source_name is not None and url is not None:
                    source_id = url.split('/')[-2]
                    r_name = response.meta["re_name"]
                    r_id = response.meta["re_id"]
                    dt = datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S")
                    msg = (None, r_name, r_id, source_name, source_id, dt)
                    command = ("INSERT IGNORE INTO {} "
                                "(id, name, recipe_id, source_name, source_id, create_time)"
                                "VALUES(%s,%s,%s,%s,%s,%s)".format(config.item_detail_table)
                    )
                    self.sql.insert_data(command, msg)


    def error_parse(self, failure):
        request = failure.request
        utils.log('error_parse url:%s meta:%s' % (request.url, request.meta))


    def save_page(self, file_name, data):
        with open(file_name, 'w') as f:
            f.write(data)
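RecipeDetail, like the other spiders here, is normally started with `scrapy crawl recipe_detail`, but it can also be driven from a plain script. A minimal sketch, assuming the class is importable from a hypothetical spiders.recipe_detail module:

# run_recipe_detail.py -- hypothetical launcher script; the import path below
# is an assumption about where the spider class lives.
from scrapy.crawler import CrawlerProcess

from spiders.recipe_detail import RecipeDetail

process = CrawlerProcess(settings={
    'LOG_LEVEL': 'INFO',
    'DOWNLOAD_DELAY': 1.0,  # be polite to www.xiachufang.com
})
process.crawl(RecipeDetail)
process.start()  # blocks until the crawl finishes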
Example #4
#-*- coding: utf-8 -*-

import config

from scrapy import Selector
from bs4 import BeautifulSoup
from sqlhelper import SqlHelper

sql = SqlHelper()

command = ("CREATE TABLE IF NOT EXISTS {} ("
           "`id` INT(10) NOT NULL UNIQUE,"
           "`name` CHAR(10) NOT NULL,"
           "PRIMARY KEY(name)"
           ") ENGINE=InnoDB".format(config.boss_city_id_table))
sql.create_table(command)

with open('spider/boss.html', 'r') as f:
    text = f.read()

sel = Selector(text=text)

soup = BeautifulSoup(text, 'lxml')
s = soup.find(name='div', attrs={'class': 'dorpdown-city'})

command = ("INSERT IGNORE INTO {} (id, name) "
           "VALUES(%s, %s)".format(config.boss_city_id_table))

lis = s.find_all('li')
for li in lis:
    print('li data-val:%s  text:%s' % (li.attrs.get('data-val'), li.text))

    msg = (li.attrs.get('data-val'), li.text)
    sql.insert_data(command, msg)
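The script above builds a Scrapy Selector but then does all the work with BeautifulSoup. For comparison, the same city list could be read from the `sel` object alone; the XPath below mirrors the BeautifulSoup lookup and is a sketch that may need adjusting to the real markup of boss.html.

# Equivalent extraction using the Selector created above; the XPath mirrors the
# BeautifulSoup lookup and may need adjusting to the real markup of boss.html.
for li in sel.xpath('//div[@class="dorpdown-city"]//li'):
    data_val = li.xpath('./@data-val').extract_first()
    text = li.xpath('string(.)').extract_first()
    print('li data-val:%s  text:%s' % (data_val, text))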
Example #5
class User(Spider):
    name = "user_urls"

    start_url = 'https://www.xiachufang.com'

    header = {
        'Accept':
        'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
        'Accept-Encoding':
        'gzip, deflate, sdch',
        'Accept-Language':
        'zh-CN,zh;q=0.8,en;q=0.6',
        'Connection':
        'keep-alive',
        'Host':
        'www.xiachufang.com',
        'Upgrade-Insecure-Requests':
        '1',
        'User-Agent':
        'Mozilla/5.0 (Macintosh; Intel Mac OS X 10.11; rv:51.0) Gecko/20100101 Firefox/51.0',
    }

    def __init__(self, *a, **kw):
        super(User, self).__init__(*a, **kw)

        self.dir_name = 'log/%s' % self.name
        self.sql = SqlHelper()
        self.init()
        utils.make_dir(self.dir_name)

    def init(self):
        command = ("CREATE TABLE IF NOT EXISTS {} ("
                   "`id` INT(8) NOT NULL AUTO_INCREMENT,"
                   "`name` CHAR(20) NOT NULL COMMENT 'user_name',"
                   "`user_id` INT(12) NOT NULL COMMENT 'user_ID',"
                   "`url` TEXT NOT NULL COMMENT 'user_url',"
                   "`create_time` DATETIME NOT NULL,"
                   "PRIMARY KEY(id),"
                   "UNIQUE KEY `user_id` (`user_id`)"
                   ") ENGINE=InnoDB".format(config.users_urls_table))

        self.sql.create_table(command)

    def start_requests(self):
        active_url = '/feature/cook/active/'
        url = self.start_url + active_url
        # NOTE: `index` is never used when building the URL, so the same
        # active-cooks page is requested N - 1 times; dont_filter prevents
        # Scrapy from deduplicating those repeated requests.
        N = 5

        for index in range(1, N):
            yield Request(
                url=url,
                headers=self.header,
                callback=self.parse_all,
                errback=self.error_parse,
                dont_filter=True,
            )

    def parse_all(self, response):
        if response.status == 200:
            file_name = '%s/users.html' % (self.dir_name)
            self.save_page(file_name, response.body)
            users = response.xpath("//div[@class='content']/ul/li").extract()
            for user in users:
                sel = Selector(text=user)

                url = sel.xpath("//div[@class='name']/a/@href").extract_first()
                user_id = url.split('/')[-2]
                name = sel.xpath(
                    "//div[@class='name']/a/text()").extract_first()
                dt = datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S")
                msg = (None, name, user_id, url, dt)
                command = ("INSERT IGNORE INTO {} "
                           "(id, name, user_id, url, create_time)"
                           "VALUES(%s,%s,%s,%s,%s)".format(
                               config.users_urls_table))
                self.sql.insert_data(command, msg)

    def error_parse(self, failure):
        request = failure.request
        utils.log('error_parse url:%s meta:%s' % (request.url, request.meta))

    def save_page(self, file_name, data):
        with open(file_name, 'w') as f:
            f.write(data)
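`utils.make_dir` and `utils.log` are called by every spider in these examples but are defined elsewhere in the project. A minimal sketch of what they presumably do (the real helpers may add log files, formatting, or levels):

# utils.py -- minimal sketch of the helpers these spiders call; the real module
# likely does more than this.
import logging
import os


def make_dir(path):
    # Create the directory only if it does not already exist.
    if not os.path.exists(path):
        os.makedirs(path)


def log(msg, level=logging.DEBUG):
    logging.log(level, msg)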
Example #6
class ItemDetail(CrawlSpider):
    name = "item_list"
    base_url = "http://www.xiachufang.com"
    header = {
        'Accept':
        'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
        'Accept-Encoding':
        'gzip, deflate, sdch',
        'Accept-Language':
        'zh-CN,zh;q=0.8,en;q=0.6',
        'Connection':
        'keep-alive',
        'Host':
        'www.xiachufang.com',
        'Upgrade-Insecure-Requests':
        '1',
        'User-Agent':
        'Mozilla/5.0 (Macintosh; Intel Mac OS X 10.11; rv:51.0) Gecko/20100101 Firefox/51.0',
    }

    def __init__(self, *a, **kw):
        super(ItemDetail, self).__init__(*a, **kw)

        self.dir_name = 'log/%s' % self.name
        self.sql = SqlHelper()
        self.init()
        utils.make_dir(self.dir_name)

    def init(self):
        command = ("CREATE TABLE IF NOT EXISTS {} ("
                   "`id` INT(8) NOT NULL AUTO_INCREMENT,"
                   "`name` CHAR(20) NOT NULL COMMENT '菜肴名称',"
                   "`url` TEXT NOT NULL COMMENT '菜肴url',"
                   "`img` TEXT NOT NULL COMMENT '封面图',"
                   "`item_id` INT(8) NOT NULL COMMENT '菜肴id',"
                   "`source` TEXT NOT NULL COMMENT '原料',"
                   "`score` CHAR(5) NOT NULL COMMENT '平分',"
                   "`create_time` DATETIME NOT NULL,"
                   "PRIMARY KEY(id),"
                   "UNIQUE KEY `item_id` (`item_id`)"
                   ") ENGINE=InnoDB".format(config.item_list_table))

        self.sql.create_table(command)

    def start_requests(self):
        command = "SELECT * from {}".format(config.category_urls_table)
        data = self.sql.query(command)

        for i, category in enumerate(data):
            url = self.base_url + category[2]
            utils.log(url)
            yield Request(
                url=url,
                headers=self.header,
                callback=self.parse_all,
                errback=self.error_parse,
            )

    def parse_all(self, response):
        utils.log(response.url)
        if response.status == 200:
            file_name = '%s/category.html' % (self.dir_name)
            self.save_page(file_name, response.body)
            recipes = response.xpath(
                "//div[@class='normal-recipe-list']/ul/li").extract()
            self.parse_recipes(recipes)
            nextPage = response.xpath(
                "//div[@class='pager']/a[@class='next']/@href").extract_first(
                )
            if nextPage:
                yield Request(
                    url=self.base_url + nextPage,
                    headers=self.header,
                    callback=self.parse_all,
                    errback=self.error_parse,
                )

    def parse_recipes(self, recipes):
        for recipe in recipes:
            sel = Selector(text=recipe)
            name = sel.xpath(
                "//p[@class='name']/text()").extract_first().strip()
            url = sel.xpath("//a[1]/@href").extract_first()
            img = sel.xpath(
                "//div[@class='cover pure-u']/img/@data-src").extract_first()
            item_id = re.compile("/recipe/(.*?)/").findall(url)[0]
            source = sel.xpath(
                "//p[@class='ing ellipsis']/text()").extract_first().strip()
            score = sel.xpath(
                "//p[@class='stats']/span/text()").extract_first().strip()
            dt = datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S")
            msg = (None, name, url, img, item_id, source, score, dt)
            command = (
                "INSERT IGNORE INTO {} "
                "(id, name, url, img, item_id, source, score, create_time)"
                "VALUES(%s,%s,%s,%s,%s,%s,%s,%s)".format(
                    config.item_list_table))
            self.sql.insert_data(command, msg)

    def error_parse(self, failure):
        request = failure.request
        utils.log('error_parse url:%s meta:%s' % (request.url, request.meta))

    def save_page(self, file_name, data):
        with open(file_name, 'w') as f:
            f.write(data)
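In parse_recipes, extract_first() returns None whenever an XPath matches nothing, so the chained .strip() calls can raise AttributeError on malformed list items. A small guard, shown as a sketch rather than the project's actual code, avoids that:

# Sketch of a small guard; extract_first(default) keeps a missing node from
# turning into an AttributeError when .strip() is called.
def first_text(sel, xpath, default=''):
    return sel.xpath(xpath).extract_first(default).strip()

# e.g. inside parse_recipes:
#   name = first_text(sel, "//p[@class='name']/text()")
#   score = first_text(sel, "//p[@class='stats']/span/text()")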
Example #7
class Category(Spider):
    name = "category_urls"

    start_url = 'http://www.xiachufang.com/category/'

    def __init__(self, *a, **kw):
        super(Category, self).__init__(*a, **kw)

        self.dir_name = 'log/%s' % self.name
        self.sql = SqlHelper()
        self.init()
        utils.make_dir(self.dir_name)

    def init(self):
        command = ("CREATE TABLE IF NOT EXISTS {} ("
                   "`id` INT(8) NOT NULL AUTO_INCREMENT,"
                   "`name` CHAR(20) NOT NULL COMMENT '分类名称',"
                   "`url` TEXT NOT NULL COMMENT '分类url',"
                   "`category` CHAR(20) NOT NULL COMMENT '父级分类',"
                   "`category_id` INT(8) NOT NULL COMMENT '分类id',"
                   "`create_time` DATETIME NOT NULL,"
                   "PRIMARY KEY(id),"
                   "UNIQUE KEY `category_id` (`category_id`)"
                   ") ENGINE=InnoDB".format(config.category_urls_table))

        self.sql.create_table(command)

    def start_requests(self):
        yield Request(
            url=self.start_url,
            headers={
                'Accept':
                'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
                'Accept-Encoding':
                'gzip, deflate, sdch',
                'Accept-Language':
                'zh-CN,zh;q=0.8,en;q=0.6',
                'Connection':
                'keep-alive',
                'Host':
                'www.xiachufang.com',
                'Upgrade-Insecure-Requests':
                '1',
                'User-Agent':
                'Mozilla/5.0 (Macintosh; Intel Mac OS X 10.11; rv:51.0) Gecko/20100101 Firefox/51.0',
            },
            callback=self.parse_all,
            errback=self.error_parse,
        )

    def parse_all(self, response):
        if response.status == 200:
            file_name = '%s/category.html' % (self.dir_name)
            self.save_page(file_name, response.body)
            categorys = response.xpath(
                "//div[@class='cates-list-all clearfix hidden']").extract()
            for category in categorys:
                sel_category = Selector(text=category)
                category_father = sel_category.xpath(
                    "//h4/text()").extract_first().strip()
                items = sel_category.xpath("//ul/li/a").extract()
                for item in items:
                    sel = Selector(text=item)
                    url = sel.xpath("//@href").extract_first()
                    name = sel.xpath("//text()").extract_first()
                    _id = re.compile('/category/(.*?)/').findall(url)[0]
                    dt = datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S")
                    msg = (None, name, url, category_father, _id, dt)
                    command = (
                        "INSERT IGNORE INTO {} "
                        "(id, name, url, category, category_id, create_time)"
                        "VALUES(%s,%s,%s,%s,%s,%s)".format(
                            config.category_urls_table))
                    self.sql.insert_data(command, msg)

    def error_parse(self, failure):
        request = failure.request
        utils.log('error_parse url:%s meta:%s' % (request.url, request.meta))

    def save_page(self, file_name, data):
        with open(file_name, 'w') as f:
            f.write(data)
Example #8
class Validator(Spider):
    name = 'base'

    def __init__(self, name=None, **kwargs):
        super(Validator, self).__init__(name, **kwargs)
        self.sql = SqlHelper()

        self.dir_log = 'log/validator/%s' % self.name
        self.timeout = 10

        self.urls = []
        self.headers = None
        self.success_mark = ''

    def init(self):
        utils.make_dir(self.dir_log)

        command = utils.get_create_table_command(self.name)
        self.sql.create_table(command)

    def start_requests(self):
        count = utils.get_table_length(self.sql, self.name)
        count_free = utils.get_table_length(self.sql, free_ipproxy_table)

        for i in range(0, count + count_free):
            table = self.name if (i < count) else free_ipproxy_table

            proxy = utils.get_proxy_info(self.sql, table, i)
            if proxy is None:
                continue

            for url in self.urls:
                cur_time = time.time()
                yield Request(
                    url=url,
                    headers=self.headers,
                    meta={
                        'cur_time':
                        cur_time,
                        'download_timeout':
                        self.timeout,
                        'proxy_info':
                        proxy,
                        'table':
                        table,
                        'id':
                        proxy.get('id'),
                        'proxy':
                        'http://%s:%s' % (proxy.get('ip'), proxy.get('port')),
                    },
                    dont_filter=True,
                    callback=self.success_parse,
                    errback=self.error_parse,
                )

    def success_parse(self, response):
        utils.log('name:%s success_parse proxy:%s meta:%s' %
                  (self.name, str(
                      response.meta.get('proxy_info')), str(response.meta)))

        filename = datetime.datetime.now().strftime('%Y-%m-%d %H_%M_%S_%f')
        self.save_page(filename, response.body)

        if self.success_mark == '' or self.success_mark in response.body:
            proxy = response.meta.get('proxy_info')
            speed = time.time() - response.meta.get('cur_time')
            table = response.meta.get('table')
            id = response.meta.get('id')

            utils.log('speed:%s table:%s id:%s' % (speed, table, id))

            if table == self.name:
                if speed > self.timeout:
                    command = utils.get_delete_data_command(table, id)
                    self.sql.execute(command)
                else:
                    command = utils.get_update_data_command(table, id, speed)
                    self.sql.execute(command)
            else:
                if speed < self.timeout:
                    command = utils.get_insert_data_command(self.name)
                    msg = (None, proxy.get('ip'), proxy.get('port'),
                           proxy.get('country'), proxy.get('anonymity'),
                           proxy.get('https'), speed, proxy.get('source'),
                           None)

                    self.sql.insert_data(command, msg)

    def error_parse(self, failure):
        utils.log('error_parse value:%s' % failure.value)

        proxy = failure.request.meta.get('proxy_info')
        table = failure.request.meta.get('table')
        id = failure.request.meta.get('id')

        if table == self.name:
            command = utils.get_delete_data_command(table, id)
            self.sql.execute(command)
        else:
            # TODO: when proxy validation fails, handle each specific error type separately
            pass

            #
            # request = failure.request.meta
            # utils.log('request meta:%s' % str(request))
            #
            # # log all errback failures,
            # # in case you want to do something special for some errors,
            # # you may need the failure's type
            # self.logger.error(repr(failure))
            #
            # #if isinstance(failure.value, HttpError):
            # if failure.check(HttpError):
            #     # you can get the response
            #     response = failure.value.response
            #     self.logger.error('HttpError on %s', response.url)
            #
            # #elif isinstance(failure.value, DNSLookupError):
            # elif failure.check(DNSLookupError):
            #     # this is the original request
            #     request = failure.request
            #     self.logger.error('DNSLookupError on %s', request.url)
            #
            # #elif isinstance(failure.value, TimeoutError):
            # elif failure.check(TimeoutError):
            #     request = failure.request
            #     self.logger.error('TimeoutError on url:%s', request.url)

    def save_page(self, filename, data):
        if get_project_settings().get('IS_RECODE_HTML', False):
            with open('%s/%s.html' % (self.dir_log, filename), 'w') as f:
                f.write(data)
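The success check uses a membership test (`self.success_mark in response.body`) rather than `str.find()`, because `find()` makes a poor boolean: it returns 0 (falsy) when the mark sits at the very start of the body and -1 (truthy) when the mark is absent. A tiny illustration:

body = 'OK: proxy works'
mark = 'OK'

print(body.find(mark))       # 0  -> falsy, although the mark is present
print(body.find('missing'))  # -1 -> truthy, although the mark is absent

# Membership is the unambiguous test:
print(mark in body)          # True
print('missing' in body)     # False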
Example #9
class Recipe(CrawlSpider):
    name = "user_recipes"

    base_url = 'https://www.xiachufang.com'
    
    header = {
        'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
        'Accept-Encoding': 'gzip, deflate, sdch',
        'Accept-Language': 'zh-CN,zh;q=0.8,en;q=0.6',
        'Connection': 'keep-alive',
        'Host': 'www.xiachufang.com',
        'Upgrade-Insecure-Requests': '1',
        'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10.11; rv:51.0) Gecko/20100101 Firefox/51.0',
    }

    def __init__(self, *a, **kw):
        super(Recipe, self).__init__(*a, **kw)

        self.dir_name = 'log/%s' % self.name
        self.sql = SqlHelper()
        self.init()
        utils.make_dir(self.dir_name)

    def init(self):
        command = (
            "CREATE TABLE IF NOT EXISTS {} ("
            "`id` INT(8) NOT NULL AUTO_INCREMENT,"
            "`name` CHAR(20) NOT NULL COMMENT 'recipe name',"
            "`url` TEXT NOT NULL COMMENT 'recipe url',"
            "`item_id` INT(8) NOT NULL COMMENT 'recipe ID',"
            "`user_id` INT(12) NOT NULL COMMENT 'user ID',"
            "`create_time` DATETIME NOT NULL,"
            "PRIMARY KEY(id),"
            "UNIQUE KEY `item_id` (`item_id`)"
            ") ENGINE=InnoDB".format(config.item_list_table)
        )

        self.sql.create_table(command)



    def start_requests(self):
        command = "SELECT * from {}".format(config.users_urls_table)
        data = self.sql.query(command)

        for i, user in enumerate(data):
            if i > 200:
                page = 1
                url = self.base_url + user[3] + 'created/?page=%d' % page
                utils.log(url)
                yield Request(
                    url = url,
                    headers = self.header,
                    meta = {"page":page, "user_id":user[2], "user_url":user[3]},
                    callback = self.parse_all,
                    errback = self.error_parse,
                )




    def parse_all(self, response):
        utils.log(response.url)
        if response.status == 200:
            file_name = '%s/user.html' % (self.dir_name)
            self.save_page(file_name, response.body)
            recipes = response.xpath("//div[@class='recipes-280-full-width-list']/ul/li").extract()

            page = response.meta["page"]
            u_url = response.meta["user_url"]
            u_id = response.meta["user_id"]
            
            for recipe in recipes:
                sel = Selector(text = recipe)
                
                name = sel.xpath("//p[@class='name ellipsis red-font']/a/text()").extract_first().strip()
                url = sel.xpath("//p[@class='name ellipsis red-font']/a/@href").extract_first()
                item_id = url.split('/')[-2]
                dt = datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S")
                msg = (None, name, url, item_id, u_id, dt)
                command = ("INSERT IGNORE INTO {} "
                            "(id, name, url, item_id, user_id, create_time)"
                            "VALUES(%s,%s,%s,%s,%s,%s)".format(config.item_list_table)
                )
                self.sql.insert_data(command, msg)
                
            page += 1
            if page < 3:
                
                yield Request(
                    url = self.base_url + u_url + 'created/?page=%d' % page,
                    meta = {"page":page, "user_id":u_id, "user_url":u_url},
                    headers = self.header,
                    callback = self.parse_all,
                    errback = self.error_parse,
                    dont_filter = True,
                )   

    def error_parse(self, failure):
        request = failure.request
        utils.log('error_parse url:%s meta:%s' % (request.url, request.meta))


    def save_page(self, file_name, data):
        with open(file_name, 'w') as f:
            f.write(data)
Example #10
class GameInfo(CrawlSpider):
    name = 'game_info'

    def __init__(self, *a, **kw):
        super(GameInfo, self).__init__(*a, **kw)

        self.dir_game = 'log/%s' % self.name
        self.sql = SqlHelper()
        self.init()

        utils.make_dir(self.dir_game)

        self.error_count = 0

    def init(self):
        command = ("CREATE TABLE IF NOT EXISTS {} ("
                   "`id` INT(8) NOT NULL AUTO_INCREMENT,"
                   "`name` TEXT NOT NULL,"
                   "`price` INT(5) NOT NULL,"
                   "`metacritic_score` FLOAT DEFAULT NULL,"
                   "`user_reviews_count` INT(6) NOT NULL,"
                   "`positive_user_reviews_count` INT(6) NOT NULL,"
                   "`positive_percent` FLOAT NOT NULL ,"
                   "`negative_user_reviews_count` INT(6) NOT NULL,"
                   '`steam_user_reviews_count` INT(6) NOT NULL,'
                   '`non_steam_user_reviews_count` INT(6) NOT NULL,'
                   '`english_user_reviews_count` INT(6) NOT NULL,'
                   '`non_english_user_reviews_count` INT(6) NOT NULL,'
                   "`tag_list` TEXT DEFAULT NULL,"
                   "`achievements_count` INT(4) DEFAULT NULL,"
                   "`category` TEXT NOT NULL,"
                   "`genre` TEXT NOT NULL,"
                   "`developer` TEXT NOT NULL,"
                   "`publisher` TEXT NOT NULL,"
                   "`release_date` TEXT NOT NULL,"
                   "`url` TEXT NOT NULL,"
                   "`language_number` INT(3) DEFAULT NULL,"
                   "`description` TEXT DEFAULT NULL,"
                   "`save_time` TIMESTAMP NOT NULL,"
                   "PRIMARY KEY(id)"
                   ") ENGINE=InnoDB".format(config.steam_game_info_table))
        self.sql.create_table(command)

    def start_requests(self):
        command = "SELECT * FROM {} WHERE is_crawled = \'no\' AND type = \'app\'".format(
            config.steam_game_urls_table)
        data = self.sql.query(command)
        for i, item in enumerate(data):
            yield Request(
                url=item[3],
                dont_filter=True,
                method='GET',
                headers={
                    'Accept':
                    'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
                    'Accept-Encoding':
                    'gzip, deflate',
                    'Accept-Language':
                    'en-US,en;q=0.5',
                    'Connection':
                    'keep-alive',
                    'Host':
                    'store.steampowered.com',
                    'Upgrade-Insecure-Requests':
                    '1',
                    'User-Agent':
                    'Mozilla/5.0 (Macintosh; Intel Mac OS X 10.11; rv:51.0) Gecko/20100101 '
                    'Firefox/51.0',
                },
                meta={
                    'item': item,
                    'id': item[0],
                },
                cookies={
                    'mature_content': '1',
                },
                callback=self.parse_game,
                errback=self.error_parse,
            )

    def parse_game(self, response):
        self.log('parse_game url:%s' % response.url)
        id = response.meta.get('id')

        # file_name = '%s/%s.html' % (self.dir_game, id)
        # self.save_page(file_name, response.body)

        if u'Please enter your birth date to continue' in response.body:
            self.log('Please enter your birth date to continue meta:%s' %
                     response.meta)

            url = 'http://store.steampowered.com/agecheck/app/%s/' % str(id)
            return FormRequest(url=url,
                               dont_filter=True,
                               method='POST',
                               formdata={
                                   # str(range(...)) would post the literal text of
                                   # the range object, not a date; a single day and
                                   # year drawn from those ranges is what the age
                                   # gate expects.
                                   'ageDay': '1',
                                   'ageMonth': 'January',
                                   'ageYear': '1990',
                                   'snr': '1_agecheck_agecheck__age-gate',
                               },
                               callback=self.parse_game)

        soup = BeautifulSoup(response.body, 'lxml')
        sel = Selector(text=response.body)

        name = sel.xpath(
            '//div[@class="apphub_AppName"]/text()').extract_first()
        if name == '' or name is None:
            self.log('no get data meta:%s' % response.meta)
            return

        price = sel.xpath('//div[@class="game_purchase_price price"]/text()'
                          ).extract_first()
        try:
            p = price.split('¥')
            price = int(p[1])
        except:
            price = -1

        # The game's score on Metacritic
        metacritic_score = sel.xpath(
            '//div[@class="score high"]/text()').extract_first()
        try:
            metacritic_score = int(metacritic_score)
        except:
            metacritic_score = -1

        # Total number of user reviews
        user_reviews_count = sel.xpath(
            '//label[@for="review_type_all"]/span/text()').extract_first()
        user_reviews_count = self.count_to_int(user_reviews_count)

        # Number of positive user reviews
        positive_user_reviews_count = sel.xpath(
            '//label[@for="review_type_positive"]/span/text()').extract_first(
            )
        positive_user_reviews_count = self.count_to_int(
            positive_user_reviews_count)

        # Percentage of positive reviews
        if user_reviews_count != -1 and positive_user_reviews_count != -1:
            positive_percent = positive_user_reviews_count * 1.0 / user_reviews_count * 100
        else:
            positive_percent = 0

        # Number of negative user reviews
        negative_user_reviews_count = sel.xpath(
            '//label[@for="review_type_negative"]/span/text()').extract_first(
            )
        negative_user_reviews_count = self.count_to_int(
            negative_user_reviews_count)

        # Number of reviews from users who bought the game on Steam
        steam_user_reviews_count = sel.xpath(
            '//label[@for="purchase_type_steam"]/span/text()').extract_first()
        steam_user_reviews_count = self.count_to_int(steam_user_reviews_count)

        # Number of reviews from users who bought the game elsewhere
        non_steam_user_reviews_count = sel.xpath(
            '//label[@for="purchase_type_non_steam"]/span/text()'
        ).extract_first()
        non_steam_user_reviews_count = self.count_to_int(
            non_steam_user_reviews_count)

        # Number of English-language reviews
        english_user_reviews_count = sel.xpath(
            '//label[@for="review_language_mine"]/span/text()').extract_first(
            )
        english_user_reviews_count = self.count_to_int(
            english_user_reviews_count)

        # Number of non-English reviews
        non_english_user_reviews_count = user_reviews_count - english_user_reviews_count

        # The game's tag list
        try:
            tags = soup.find(attrs={'class': 'glance_tags popular_tags'})
            tag_list = tags.text.replace('\t', '')
            tag_list = tag_list.replace('\n', ',')
        except:
            tag_list = ''

        # Number of achievements for this game
        achievements = sel.xpath(
            '//div[@id="achievement_block"]/div/text()').extract_first()
        try:
            achievements_count = re.search(r'\d+', achievements, re.S).group(0)
            achievements_count = int(achievements_count)
        except:
            achievements_count = 0

        # The game's category breadcrumb, e.g. All Games > Action Games > Counter-Strike
        try:
            category = soup.find(name='div', attrs={
                'class': 'breadcrumbs'
            }).text
            category = category.replace('\t', '')
            category = category.replace('\n', '')
        except:
            category = ''

        # Genre
        genre = sel.xpath(
            '//div[@class="block_content"]/div/div/a/text()').extract_first()

        # Developer
        developer = sel.xpath(
            '//div[@class="block_content"]/div/div/a[2]/text()').extract_first(
            )

        # Publisher
        publisher = sel.xpath(
            '//div[@class="block_content"]/div/div/a[3]/text()').extract_first(
            )

        # Release date
        release_date = sel.xpath(
            '//div[@class="release_date"]/span/text()').extract_first()

        # Number of languages the game supports
        language_number = len(
            sel.xpath(
                '//table[@class="game_language_options"]/tr').extract()) - 1

        # Game description
        description = sel.xpath(
            '//div[@class="game_description_snippet"]/text()').extract_first()

        # Time this game was crawled
        save_time = None

        msg = (id, name, price, response.url, metacritic_score,
               user_reviews_count, positive_user_reviews_count,
               positive_percent, negative_user_reviews_count,
               steam_user_reviews_count, non_steam_user_reviews_count,
               english_user_reviews_count, non_english_user_reviews_count,
               tag_list, achievements_count, category, genre, developer,
               publisher, release_date, language_number, description,
               save_time)

        command = (
            "INSERT IGNORE INTO {} "
            "(id, name, price, url, metacritic_score, user_reviews_count, positive_user_reviews_count, "
            "positive_percent, negative_user_reviews_count, steam_user_reviews_count, "
            "non_steam_user_reviews_count, english_user_reviews_count, non_english_user_reviews_count, "
            "tag_list, achievements_count, category, genre, developer, publisher, release_date, "
            "language_number, description, save_time)"
            "VALUES(%s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, "
            "%s)".format(config.steam_game_info_table))

        self.sql.insert_data(command, msg)

        command = "UPDATE {0} SET is_crawled=\'yes\' WHERE id=\'{1}\'".format(
            config.steam_game_urls_table, id)
        self.sql.execute(command)

    def error_parse(self, failure):
        request = failure.request
        utils.log('error_parse url:%s meta:%s' % (request.url, request.meta))

    def get_id(self, url):
        type = ''
        if '/sub/' in url:
            pattern = re.compile(r'/sub/(\d+)/')
            type = 'sub'
        elif '/app/' in url:
            pattern = re.compile(r'/app/(\d+)/', re.S)
            type = 'app'
        elif '/bundle/' in url:
            pattern = re.compile(r'/bundle/(\d+)/', re.S)
            type = 'bundle'
        else:
            pattern = re.compile(r'/(\d+)/', re.S)
            type = 'other'
            utils.log('get_id other url:%s' % url)

        id = re.search(pattern, url)
        if id:
            id = id.group(1)
            return id

        self.error_count = self.error_count + 1
        utils.log('get_id error url:%s' % url)
        return -self.error_count

    def count_to_int(self, data):
        try:
            ret = data
            ret = ret.replace('(', '')
            ret = ret.replace(')', '')
            ret = ret.replace(',', '')

            return int(ret)
        except:
            return -1

    def save_page(self, file_name, data):
        with open(file_name, 'w') as f:
            f.write(data)
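count_to_int strips the parentheses and thousands separators Steam wraps around review counts, returning -1 when parsing fails; positive_percent is then a plain ratio of the cleaned counts. A standalone copy of the same cleanup logic, with made-up numbers, shows the expected behaviour:

# Standalone copy of the count_to_int cleanup; the numbers are made up.
def count_to_int(data):
    try:
        return int(data.replace('(', '').replace(')', '').replace(',', ''))
    except Exception:
        return -1

total = count_to_int('(12,345)')     # 12345
positive = count_to_int('(10,000)')  # 10000
print(count_to_int(None))            # -1, the parse-failure sentinel

positive_percent = positive * 1.0 / total * 100
print(round(positive_percent, 1))    # 81.0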