def __init__(self):
    # general-purpose data dict/list
    self.generalData_dict = dict()
    self.generalData_list = list()
    self.setup_hooks()    # set up hooks
    self.setup_domains()  # set up domain names
    # initialize the crawl spider (doing this last is essential)
    CrawlSpider.__init__(self)
def __init__(self):
    self.domain = "www.gsmarena.com"
    self.name = "gsmarena"
    self.custom_settings = {}
    self.allowed_domains = ["www.gsmarena.com"]
    CrawlSpider.__init__(self)
    self.start_urls = ["http://www.gsmarena.com/",
                       "http://www.gsmarena.com/makers.php3"]
    self.count = 0
    self.deny = ""
    self.crawl_limt = 0
    self.real_count = 0
def __init__(self):
    self.domain = "www.gsmarena.com"
    self.name = "gsmarena"
    self.custom_settings = {}
    self.allowed_domains = ["www.gsmarena.com"]
    CrawlSpider.__init__(self)
    self.start_urls = ["http://www.gsmarena.com/makers.php3",
                       "http://www.gsmarena.com/acer-phones-59.php",
                       "http://www.gsmarena.com/alcatel-phones-5.php"]
    self.count = 0
    self.deny = ""
    self.crawl_limt = 0
    self.real_count = 0
    self.batch_size = 300
    self.mobile_product = []
def __init__(self, **kwargs):
    '''
    :param kwargs: Read user arguments and initialize variables
    '''
    CrawlSpider.__init__(self)
    self.outDir = kwargs['outDir']
    self.startYear = kwargs['startYear']
    self.endYear = kwargs['endYear']
    print('startYear: ', self.startYear)
    print('self.endYear: ', self.endYear)
    print('self.outDir: ', self.outDir)
    self.headers = ({'User-Agent': 'Mozilla/5.0',
                     'Content-Type': 'application/x-www-form-urlencoded; charset=UTF-8',
                     'X-Requested-With': 'XMLHttpRequest'})
    self.payload = {'username': '******', 'password': '******'}
    self.apikey = '[API Key for Gigya]'
    self.categoryID = 'Production'
def __init__(self, rule, worksheet, logging):
    CrawlSpider.__init__(self)
    # use any browser you wish
    self.browser = webdriver.Firefox()
    self.logging = logging
    self.rule = rule
    self.name = self.rule["ranking_name"]
    self.logging.info("==============================")
    self.logging.info("self.rule[start_urls]: %s" % self.rule["start_urls"])
    self.start_urls = self.rule["start_urls"]
    # self.next_page is a defined array.
    self.next_page = self.rule["next_page"] if ("next_page" in self.rule) else ["NONE"]
    self.logging.info("#### self.next_page %s" % self.next_page)
    self.flag = self.rule["flag"] if ("flag" in self.rule) else ["NONE"]
    self.logging.info("#### self.flag %s" % self.flag)
    self.worksheet = worksheet
    self.logging.info("Finish the __init__ method ... ")
def __init__(self):
    CrawlSpider.__init__(self)
    # create the database, moving any existing db file out of the way first
    dbfile = '%s/%s' % (conf.PROJECT_PATH['data'], conf.SQLITE['file'])
    if os.path.exists(dbfile):
        moveto = '%s.%d' % (dbfile, int(time.time()))
        shutil.move(dbfile, moveto)
        print('old db file %s is moved to %s.' % (dbfile, moveto))
    conn = sqlite3.connect(dbfile)
    try:
        cursor = conn.cursor()
        for table in conf.SQLITE['tables']:
            cursor.execute(table['sql'])
        conn.commit()
        print('db initialization complete!')
    finally:
        conn.close()
def __init__(self):
    CrawlSpider.__init__(self)
    connection = pymongo.MongoClient(settings['MONGODB_HOST'],
                                     settings['MONGODB_PORT'])
    db = connection[settings['MONGODB_DB']]
    self.collection = db[settings['MONGODB_COLLECTION']]
def __init__(self, *args, **kwargs):
    CrawlSpider.__init__(self, *args, **kwargs)
    self.proxy_pool = proxy_list
def test_crawl_spider(self):
    assert issubclass(CrawlSpider, Spider)
    assert isinstance(CrawlSpider(name="foo"), Spider)
def parse(self, response):
    return CrawlSpider.parse(self, response)
def __init__(self, *arg, **karg):
    self.init_yaml('scrapy_service/templates/product.yaml', 'lazada_sitemap')
    CrawlSpider.__init__(self, *arg)
def __init__(self, *a, **kw):
    CrawlSpider.__init__(self, *a, **kw)
    self.crawledurl = set()
    self.itemIds = set()
def __init__(self):
    CrawlSpider.__init__(self, self.name)
    self.driver = create_bs_driver()
    self.driver.set_page_load_timeout(20)
    self.num = ''
def __init__(self):
    CrawlSpider.__init__(self)
    self.verificationErrors = []
    self.selenium = webdriver.Firefox(executable_path="/Users/theodoreshih/Downloads/geckodriver")
    # ("localhost", 4444, "*chrome", "http://www.domain.com")
    self.selenium.get('https://www.greenrush.com/dispensary/cannabis-express')
def __init__(self):
    CrawlSpider.__init__(self)
    self.verificationErrors = []
    self.selenium = webdriver.Firefox()
def __init__(self, start_date, end_date):
    CrawlSpider.__init__(self)
    SpiderBase.__init__(self, 'http://www.wickerparkbucktown.com/',
                        start_date, end_date, date_format='%B %d, %Y')
def __init__(self, **kwargs):
    CrawlSpider.__init__(self, **kwargs)
def __init__(self):
    CrawlSpider.__init__(self)
    # use any browser you wish
    self.browser = webdriver.Firefox()
def __init__(self):
    CrawlSpider.__init__(self)
    BaseCrawler.__init__(self)
def __init__(self, xpath_dict={}, files=None):
    CrawlSpider.__init__(self)
    self.xpath_dict = xpath_dict
    self.from_url_file = files
    if self.from_url_file:
        self.crawl_from_files()
def __init__(self, *arg, **karg):
    self.name = karg['name']
    self.init_yaml('scrapy_service/templates/product.yaml', self.name)
    CrawlSpider.__init__(self, *arg)
def __init__(self):
    CrawlSpider.__init__(self)
    self.verificationErrors = []
    # self.selenium = selenium('localhost', 4444, "*chrome")
    self.driver = webdriver.Firefox()
def __init__(self, *args, **kwargs):
    CrawlSpider.__init__(self)
    SpiderBase.__init__(self, *args, **kwargs)
def test_crawl_spider(self):
    assert issubclass(CrawlSpider, Spider)
    assert issubclass(CrawlSpider, BaseSpider)
    assert isinstance(CrawlSpider(name='foo'), Spider)
    assert isinstance(CrawlSpider(name='foo'), BaseSpider)
def set_crawler(self, crawler):
    print('call set_crawler')
    CrawlSpider.set_crawler(self, crawler)
def _requests_to_follow(self, response):
    if getattr(response, "encoding", None) is not None:
        return CrawlSpider._requests_to_follow(self, response)
    else:
        return []
def __init__(self, *args, **kwargs):
    CrawlSpider.__init__(self, *args, **kwargs)
def __init__(self, *arg, **karg):
    self.name = karg['name']
    self.init_yaml('scrapy_service/templates/product.yaml', self.name)
    CrawlSpider.__init__(self, *arg)
# -*- coding: utf-8 -*-
"""
CrawlSpider (class scrapy.spiders.CrawlSpider)

Spider class: only crawls the pages listed in start_urls.
CrawlSpider class: defines a set of rules (Rule) that provide a mechanism for
following links; links are extracted from crawled pages and the crawl continues.

LinkExtractor class: used to extract links. Its extract_links() method takes a
Response object and returns scrapy.link.Link objects. The class is instantiated
once, and extract_links() is called repeatedly to extract links from different
responses.

class scrapy.linkextractors.LinkExtractor(
    allow = (),              # URLs matching these regular expressions are extracted;
                             # if empty, everything matches (commonly used)
    deny = (),               # URLs matching these regular expressions are never extracted
    allow_domains = (),      # domains whose links will be extracted (commonly used)
    deny_domains = (),       # domains whose links will never be extracted
    deny_extensions = None,
    restrict_xpaths = (),    # XPath expressions used together with allow to filter links
    tags = ('a', 'area'),
    attrs = ('href'),
    canonicalize = True,
    unique = True,
    process_value = None
)

rules: contains one or more Rule objects; each Rule defines a specific action
for crawling the site.

class scrapy.spiders.Rule(
    link_extractor,          # a LinkExtractor object defining which links to extract
    callback = None,         # callback invoked to process the response for each new
                             # link obtained from link_extractor
    cb_kwargs = None,        # dict of keyword arguments passed to the callback
    follow = None,           # whether to follow links from responses handled by this rule
    process_links = None,    # callable used to filter/modify the extracted links
    process_request = None   # callable used to process each Request generated by this rule
)
"""
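A minimal sketch of how the pieces described above fit together; the spider name,
domain, URL patterns, and XPath are illustrative placeholders, not taken from the
snippets in this file:

from scrapy.spiders import CrawlSpider, Rule
from scrapy.linkextractors import LinkExtractor


class ExampleSpider(CrawlSpider):
    name = 'example'
    allowed_domains = ['example.com']
    start_urls = ['http://example.com/']

    rules = (
        # follow pagination links and keep crawling them
        Rule(LinkExtractor(allow=(r'page=\d+',)), follow=True),
        # hand item pages to parse_item; follow defaults to False when a callback is set
        Rule(LinkExtractor(allow=(r'/item/\d+',)), callback='parse_item'),
    )

    def parse_item(self, response):
        # placeholder extraction; replace the XPath with the fields you need
        yield {'title': response.xpath('//title/text()').get()}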
def __init__(self):
    CrawlSpider.__init__(self)
    self.verificationErrors = []
    # self.selenium = selenium('localhost', 4444, "*chrome")
    self.driver = webdriver.Firefox()
def __init__(self, **kwargs):
    LrmiBase.__init__(self, **kwargs)
    CrawlSpider.__init__(self, **kwargs)
def __del__(self):
    self.driver.close()
    print(self.verificationErrors)
    CrawlSpider.__del__(self)
def __init__(self, start_date, end_date):
    CrawlSpider.__init__(self)
    SpiderBase.__init__(self, 'https://www.chicagohistory.org/',
                        start_date, end_date,
                        date_format='%d %B %Y',
                        request_date_format='%Y%m%d')
def __init__(self):
    CrawlSpider.__init__(self)
    self.verificationErrors = []
    self.selenium = selenium("localhost", 4444, "*chrome", "http://www.domain.com")
    self.selenium.start()
def parse(self, response):
    for res in CrawlSpider.parse(self, response):
        yield self.get_request(res.url)
def set_crawler(self, crawler):
    CrawlSpider.set_crawler(self, crawler)  # set up the default crawler
    RedisMixin.setup_redis(self)            # start URLs come from redis
def __init__(self, *args, **kwargs):
    CrawlSpider.__init__(self)
    if 'mining_job_id' in kwargs:
        self.mining_job_id = kwargs['mining_job_id']
    if 'site_id' in kwargs:
        self.site_id = kwargs['site_id']
    if 'preview' in kwargs:
        self.preview = 1
    if 'iteration' in kwargs:
        self.iteration = kwargs['iteration']
    if 'management_node' in kwargs:
        self.management_node = kwargs['management_node']
    if 'username' in kwargs:
        self.username = kwargs['username']
    if 'password' in kwargs:
        self.password = kwargs['password']
    if 'proxy' in kwargs:
        self.proxy = kwargs['proxy']
    if 'robots_obey' in kwargs:
        settings.set('ROBOTSTXT_OBEY', int(kwargs['robots_obey']), priority='cmdline')
    if 'url' in kwargs:
        self.start_urls.append(kwargs['url'] + self.url_fragmentanchor)
    if 'extract' in kwargs:
        self.extract = kwargs['extract']
    if 'maxjobs' in kwargs:
        self.maxjobs = int(kwargs['maxjobs'])
    if 'protocol' in kwargs:
        self.protocol = kwargs['protocol']
    if 'maximum_try' in kwargs:
        self.maximum_try = kwargs['maximum_try']
    if 'on_demand' in kwargs:
        self.on_demand = kwargs['on_demand']
    if 'debug_id' in kwargs:
        self.debug_id = kwargs['debug_id']
    if 'stale_limit_seconds' in kwargs:
        self.stale_limit = int(kwargs['stale_limit_seconds'])
    if 'subspider_detector' in kwargs:
        self.subspider_detector = True
        self.required_fields = self.subspider_detect_fields
    # Sending max items to be scraped.
    if 'max_items_count' in kwargs:
        self.max_items_count = int(kwargs['max_items_count'])
        # set spider_valid_cutoff, default 80 percent of max_items_count
        spider_valid_cutoff = kwargs.get("valid_cutoff")
        if spider_valid_cutoff:
            self.spider_valid_cutoff = int(spider_valid_cutoff)
        else:
            self.spider_valid_cutoff = int(self.max_items_count * 0.8)
        # this will reduce extra requests after a close_spider call
        settings.overrides['CONCURRENT_REQUESTS'] = 1
    self.debug = int(kwargs.get('debug', '0'))
    if 'download_delay' in kwargs or hasattr(self, 'download_delay'):
        download_delay = float(kwargs.get('download_delay', getattr(self, 'download_delay', 0)))
        settings.set('DOWNLOAD_DELAY', download_delay, priority='cmdline')
        if download_delay > 0:
            settings.set('AUTOTHROTTLE_ENABLED', True, priority='cmdline')
    if self.allowed_domain_bynetloc:
        # set list of domains allowed to crawl
        self.allowed_domains.append(urlparse.urlparse(kwargs['url']).netloc)
    self.default_job_field_getters.update({
        'url': lambda self, response, item: response.url,
        'date': lambda self, response, item: datetime.now().strftime('%Y/%m/%d'),
        'language': lambda self, response, item: self.language if hasattr(self, 'language') else None
    })
    if self.extract_logo:
        self.default_job_field_getters.update({'autoextracted_logo_urls': self.get_logos})
    if self.extract_email:
        self.default_job_field_getters.update({'autoextracted_emails': self.get_emails})
    if self.extract_salary:
        self.default_job_field_getters.update({'autoextracted_salaries': self.get_salaries})
    if self.extract_website:
        self.default_job_field_getters.update({'autoextracted_company_websites': self.get_websites})
    self.default_fields = self.default_job_field_getters.keys()
    self.validate_parse_job_wrapper = validate(fields_to_check=self.required_fields)(type(self).parse_job_wrapper)
    dispatcher.connect(self.spider_closed, signal=signals.spider_closed)
def __del__(self):
    self.selenium.stop()
    print(self.verificationErrors)
    CrawlSpider.__del__(self)
def _set_crawler(self, crawler):
    CrawlSpider._set_crawler(self, crawler)
    RedisMixin.setup_redis(self)
def __init__(self):
    CrawlSpider.__init__(self)
def __init__(self, *a, **kw): """Init BaseSpider with storage configuration""" CrawlSpider.__init__(self, *a, **kw) self.source_name = self.get_source_name() self.storage = get_storage(self.source_name)
def set_crawler(self, crawler):
    CrawlSpider.set_crawler(self, crawler)  # set up the default crawler
    RedisMixin.setup_redis(self)            # start URLs come from redis
def __init__(self, *args, **kwargs):
    BaseSpider.__init__(self, *args, **kwargs)
    CS.__init__(self, *args, **kwargs)