def __init__(self, *args, **kwargs):
    super(MarketSpider, self).__init__(*args, **kwargs)
    self._baseclass = MarketSpider
    self.configure_request_sharing()
    db.init(dbsettings)
    if not hasattr(self, 'request_queue_chunk'):
        self.request_queue_chunk = 100
    # Reuse a DAO passed in by the launcher, otherwise create our own.
    if 'dao' in kwargs:
        self.dao = kwargs['dao']
    else:
        self.dao = self.make_dao()
    self.set_timezone()
    try:
        self.market = Market.get(spider=self.name)
    except Exception:
        raise Exception(
            "No market entry exists in the database for spider %s" % self.name)
    # Preload the user cache only once per spider class.
    if not hasattr(self._baseclass, '_cache_preloaded') or not self._baseclass._cache_preloaded:
        self.dao.cache.reload(User, User.market == self.market)
        self._baseclass._cache_preloaded = True
    self.register_new_scrape()
    self.start_statistics()
    self.manual_input = None
    self.request_after_manual_input = None

def __init__(self, *args, **kwargs):
    super(ForumSpider, self).__init__(*args, **kwargs)
    self._baseclass = ForumSpider
    self.configure_request_sharing()
    db.init(dbsettings)
    # Reuse a DAO passed in by the launcher, otherwise create our own.
    if 'dao' in kwargs:
        self.dao = kwargs['dao']
    else:
        self.dao = self.make_dao()
    self.set_timezone()
    try:
        self.forum = Forum.get(spider=self.name)
    except Exception:
        raise Exception("No forum entry exists in the database for spider " + self.name)
    # Preload the user and thread caches only once per spider class.
    if not hasattr(self._baseclass, '_cache_preloaded') or not self._baseclass._cache_preloaded:
        self.dao.cache.reload(User, User.forum == self.forum)
        self.dao.cache.reload(Thread, Thread.forum == self.forum)
        self._baseclass._cache_preloaded = True
    self.request_after_manual_input = None
    self.register_new_scrape()
    self.start_statistics()
    self.manual_input = None

def setUp(self):
    configure_logging(install_root_handler=False)
    logging.basicConfig(
        format='%(levelname)s: %(message)s',
        level=logging.INFO
    )
    self.spider = MockedSpider()
    self.dao = DatabaseDAO('markets')
    db.init(dbsetting)

def setUp(self):
    db.init(dbsetting)

if not issubclass(spcls, ForumSpider):
    raise Exception('Spider %s is not a Forum Spider. Please use the right script for your spider.' % spider_name)


if __name__ == '__main__':
    parser = argparse.ArgumentParser()
    parser.add_argument('--spider', required=True,
                        help='The spider name to launch')
    parser.add_argument('--instances', default=1, type=int,
                        help='Number of instances of the spider to launch')
    parser.add_argument('--login', nargs='*',
                        help='List of logins the spider can use. Each item is the name of a key in the spider settings file.')
    parser.add_argument('--mode', choices=['crawl', 'replay'], default='crawl',
                        help='Select the crawl mode. With "crawl", download all pages from the target website. With "replay", reuse the downloaded responses from the HTTP cache.')
    args = parser.parse_args()

    settings = get_project_settings()
    assert_good_spider_type(settings, args.spider)
    db.init(dbsettings)
    settings.set('login', args.login)   # List of allowed logins to use.
    settings.set('MODE', args.mode)     # replay: read responses from the filesystem cache.

    crawlerprocess = CrawlerProcess(settings)
    # Create a Process entry in the database. We pass this object to the spiders
    # so we know they were launched together.
    dbprocess = start_dbprocess()
    spider_attributes = {
        'process': dbprocess,
        'dao': ForumSpider.make_dao()   # The DAO is shared between spiders.
    }
    for i in range(0, args.instances):
        crawlerprocess.crawl(args.spider, **spider_attributes)

def __init__(self, *args, **kwargs):
    super(ChangerateSpider, self).__init__(*args, **kwargs)
    db.init(dbsettings)
    self.download_delay = 60 * 60   # one request per hour (delay in seconds)
    self.max_concurrent_requests = 1