Example #1
    def __init__(self):
        global config

        self.confs_db = {
            'host': config.get('DB', 'host'),
            'user': config.get('DB', 'user'),
            'passwd': config.get('DB', 'passwd'),
            'db': config.get('DB', 'db'),
        }
        self.db = MysqlHandler(self.confs_db)

        self.urls = Urls()
        self.ua = UserAgents()
        self.httpheaders = {
            'Connection': 'keep-alive',
            'Accept':
            'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
            'Accept-Encoding': 'gzip,deflate,sdch',
            'Accept-Language': 'en-US,en;q=0.8',
            'Accept-Charset': 'ISO-8859-1,utf-8;q=0.7,*;q=0.3',
            'Cache-Control': 'max-age=0',
            'Referer': 'http://www.google.com',
        }

        settings.overrides['CONCURRENT_REQUESTS'] = config.get(
            'SCRAPY', 'concurrent_requests')
        settings.overrides['CONCURRENT_REQUESTS_PER_DOMAIN'] = \
            config.get('SCRAPY', 'concurrent_requests_per_domain')
        settings.overrides['DOWNLOAD_TIMEOUT'] = config.get(
            'SCRAPY', 'download_timeout')
Example #2
    def __init__(self):
        global config

        self.confs_db = {
            'host': config.get('DB', 'host'),
            'user': config.get('DB', 'user'),
            'passwd': config.get('DB', 'passwd'),
            'db': config.get('DB', 'db'),
            }
        self.db = MysqlHandler(self.confs_db)

        self.urls = Urls()
        self.ua = UserAgents()
        self.httpheaders = {
            'Connection'     : 'keep-alive',
            'Accept'         : 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
            'Accept-Encoding': 'gzip,deflate,sdch',
            'Accept-Language': 'en-US,en;q=0.8',
            'Accept-Charset' : 'ISO-8859-1,utf-8;q=0.7,*;q=0.3',
            'Cache-Control'  : 'max-age=0',
            'Referer'        : 'http://www.google.com',
            }

        settings.overrides['CONCURRENT_REQUESTS'] = config.get('SCRAPY', 'concurrent_requests')
        settings.overrides['CONCURRENT_REQUESTS_PER_DOMAIN'] = config.get('SCRAPY', 'concurrent_requests_per_domain')
        settings.overrides['DOWNLOAD_TIMEOUT'] = config.get('SCRAPY', 'download_timeout')
Example #3
    def __init__(self):
        global config

        self.confs_db = {
            "host": config.get("DB", "host"),
            "user": config.get("DB", "user"),
            "passwd": config.get("DB", "passwd"),
            "db": config.get("DB", "db"),
        }
        self.db = MysqlHandler(self.confs_db)

        self.urls = Urls()
        self.ua = UserAgents()
        self.httpheaders = {
            "Connection": "keep-alive",
            "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8",
            "Accept-Encoding": "gzip,deflate,sdch",
            "Accept-Language": "en-US,en;q=0.8",
            "Accept-Charset": "ISO-8859-1,utf-8;q=0.7,*;q=0.3",
            "Cache-Control": "max-age=0",
            "Referer": "http://www.google.com",
        }

        settings.overrides["CONCURRENT_REQUESTS"] = config.get("SCRAPY", "concurrent_requests")
        settings.overrides["CONCURRENT_REQUESTS_PER_DOMAIN"] = config.get("SCRAPY", "concurrent_requests_per_domain")
        settings.overrides["DOWNLOAD_TIMEOUT"] = config.get("SCRAPY", "download_timeout")
Example #4
import json

from scrapy.conf import settings
from scrapy.http import Request
from scrapy.spider import BaseSpider

# MysqlHandler, Urls, UserAgents and the module-level `config` come from
# project modules that are not shown in these snippets.


class Crawler(BaseSpider):

    name = 'crawler'

    def __init__(self):
        global config

        self.confs_db = {
            'host': config.get('DB', 'host'),
            'user': config.get('DB', 'user'),
            'passwd': config.get('DB', 'passwd'),
            'db': config.get('DB', 'db'),
        }
        self.db = MysqlHandler(self.confs_db)

        self.urls = Urls()
        self.ua = UserAgents()
        self.httpheaders = {
            'Connection': 'keep-alive',
            'Accept':
            'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
            'Accept-Encoding': 'gzip,deflate,sdch',
            'Accept-Language': 'en-US,en;q=0.8',
            'Accept-Charset': 'ISO-8859-1,utf-8;q=0.7,*;q=0.3',
            'Cache-Control': 'max-age=0',
            'Referer': 'http://www.google.com',
        }

        settings.overrides['CONCURRENT_REQUESTS'] = config.get(
            'SCRAPY', 'concurrent_requests')
        settings.overrides['CONCURRENT_REQUESTS_PER_DOMAIN'] = \
            config.get('SCRAPY', 'concurrent_requests_per_domain')
        settings.overrides['DOWNLOAD_TIMEOUT'] = config.get(
            'SCRAPY', 'download_timeout')

    def start_requests(self):
        headers = self.httpheaders
        for url in self.urls.get_urls():
            ua = json.loads(self.ua.get_ua())['0']
            headers['User-Agent'] = ua

            # Parameters for the callback can be added to meta here, e.g.
            # meta = {'name': 'flyer', 'url_refer': url, }
            # Cookie data can be added as well, e.g.
            # cookies = {'provinceId': 2, }
            meta = {}
            cookies = {}

            # Rename the callback to suit your needs, just do not name it 'parse'
            yield Request(url,
                          headers=headers,
                          meta=meta,
                          cookies=cookies,
                          callback=self.parse_origin)

    def parse_origin(self, response):
        """处理入口 url 返回的响应"""
        pass
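
A note on start_requests() above: headers = self.httpheaders binds a second name to the shared dict rather than copying it, so headers['User-Agent'] = ua rewrites self.httpheaders on every iteration. Scrapy's Request builds its own Headers object from the mapping it receives, so each yielded request still carries the agent set for it, but a per-iteration copy keeps the shared template untouched. A minimal variant, dropping the empty meta/cookies for brevity:

    def start_requests(self):
        for url in self.urls.get_urls():
            headers = dict(self.httpheaders)  # per-request copy of the shared template
            headers['User-Agent'] = json.loads(self.ua.get_ua())['0']
            yield Request(url, headers=headers, callback=self.parse_origin)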
Example #5
class Crawler(BaseSpider):
    
    name = 'crawler'

    def __init__(self):
        global config

        self.confs_db = {
            'host'  : config.get('DB', 'host'),
            'user'  : config.get('DB', 'user'),
            'passwd': config.get('DB', 'passwd'),
            'db'    : config.get('DB', 'db'),
            }
        self.db = MysqlHandler(self.confs_db)

        self.urls = Urls()
        self.ua = UserAgents()
        self.httpheaders = {
            'Connection'     : 'keep-alive',
            'Accept'         : 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
            'Accept-Encoding': 'gzip,deflate,sdch',
            'Accept-Language': 'en-US,en;q=0.8',
            'Accept-Charset' : 'ISO-8859-1,utf-8;q=0.7,*;q=0.3',
            'Cache-Control'  : 'max-age=0',
            'Referer'        : 'http://www.google.com',
            }

        settings.overrides['CONCURRENT_REQUESTS'] = config.get('SCRAPY', 'concurrent_requests')
        settings.overrides['CONCURRENT_REQUESTS_PER_DOMAIN'] = \
            config.get('SCRAPY', 'concurrent_requests_per_domain')
        settings.overrides['DOWNLOAD_TIMEOUT'] = config.get('SCRAPY', 'download_timeout')

    def start_requests(self):
        headers = self.httpheaders
        for url in self.urls.get_urls():
            ua = json.loads(self.ua.get_ua())['0']
            headers['User-Agent'] = ua

            # Parameters for the callback can be added to meta here, e.g.
            # meta = {'name': 'flyer', 'url_refer': url, }
            # Cookie data can be added as well, e.g.
            # cookies = {'provinceId': 2, }
            meta    = {}
            cookies = {}
            
            # Rename the callback to suit your needs, just do not name it 'parse'
            yield Request(url, headers=headers, meta=meta, cookies=cookies, callback=self.parse_origin)

    def parse_origin(self, response):
        """处理入口 url 返回的响应"""
        pass
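
The user-agent rotation in these spiders assumes UserAgents.get_ua() returns a JSON string whose key '0' maps to a user-agent string; the helper itself never appears in these examples. A hypothetical stand-in matching that shape:

import json
import random

class UserAgents(object):
    # Hypothetical stub; the project's real class is not shown in these examples.
    _POOL = [
        'Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.36 (KHTML, like Gecko)',
        'Mozilla/5.0 (X11; Linux x86_64; rv:24.0) Gecko/20100101 Firefox/24.0',
    ]

    def get_ua(self):
        # Matches the call site: json.loads(self.ua.get_ua())['0']
        return json.dumps({'0': random.choice(self._POOL)})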
Example #6
class Crawler(BaseSpider):

    name = "crawler"

    def __init__(self):
        global config

        self.confs_db = {
            "host": config.get("DB", "host"),
            "user": config.get("DB", "user"),
            "passwd": config.get("DB", "passwd"),
            "db": config.get("DB", "db"),
        }
        self.db = MysqlHandler(self.confs_db)

        self.urls = Urls()
        self.ua = UserAgents()
        self.httpheaders = {
            "Connection": "keep-alive",
            "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8",
            "Accept-Encoding": "gzip,deflate,sdch",
            "Accept-Language": "en-US,en;q=0.8",
            "Accept-Charset": "ISO-8859-1,utf-8;q=0.7,*;q=0.3",
            "Cache-Control": "max-age=0",
            "Referer": "http://www.google.com",
        }

        settings.overrides["CONCURRENT_REQUESTS"] = config.get("SCRAPY", "concurrent_requests")
        settings.overrides["CONCURRENT_REQUESTS_PER_DOMAIN"] = config.get("SCRAPY", "concurrent_requests_per_domain")
        settings.overrides["DOWNLOAD_TIMEOUT"] = config.get("SCRAPY", "download_timeout")

    def start_requests(self):
        headers = self.httpheaders
        for url in self.urls.get_urls():
            ua = json.loads(self.ua.get_ua())["0"]
            headers["User-Agent"] = ua

            # Parameters for the callback can be added to meta here, e.g.
            # meta = {'name': 'flyer', 'url_refer': url, }
            # Cookie data can be added as well, e.g.
            # cookies = {'provinceId': 2, }
            meta = {}
            cookies = {}

            # Rename the callback to suit your needs, just do not name it 'parse'
            yield Request(url, headers=headers, meta=meta, cookies=cookies, callback=self.parse_origin)

    def parse_origin(self, response):
        """处理入口 url 返回的响应"""
        pass
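
One last caveat: ConfigParser's get() returns strings, so the three settings.overrides assignments in every example store strings rather than integers. Legacy Scrapy generally coerces them when reading them back (settings.getint()/getfloat()), but converting at the assignment is more defensive. A sketch, assuming the same ConfigParser-style config object:

settings.overrides['CONCURRENT_REQUESTS'] = config.getint('SCRAPY', 'concurrent_requests')
settings.overrides['CONCURRENT_REQUESTS_PER_DOMAIN'] = \
    config.getint('SCRAPY', 'concurrent_requests_per_domain')
settings.overrides['DOWNLOAD_TIMEOUT'] = config.getint('SCRAPY', 'download_timeout')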