def __init__(self, *args, **kw):
    """Configure the spider from launch arguments and prepare its storage.

    The following keyword arguments are popped from ``kw`` before the
    remaining arguments are forwarded to the parent constructor:

    * ``SCRAPED_DOMAIN`` -- domain to crawl; ``allowed_domains`` is set to
      both its bare and its ``www.``-prefixed form.
    * ``START_URL`` -- when given, becomes the only entry of ``start_urls``.
    * ``LOCAL_DOMAIN`` -- required; stored on the ``WebSite`` row created
      for the scraped domain.

    Raises:
        GrabberSpiderError: if ``LOCAL_DOMAIN`` is missing, or if
            ``check_local_domain_uniqueness`` rejects it.
        Exception: if ``downloaded.path`` or ``downloaded.url`` is not
            configured in ``WEB_APP_SETTINGS``.
    """
    # Extra parameters of the scraper launch command.
    SCRAPED_DOMAIN = kw.pop('SCRAPED_DOMAIN', None)
    START_URL = kw.pop('START_URL', None)
    LOCAL_DOMAIN = kw.pop('LOCAL_DOMAIN', None)

    if START_URL:
        self.start_urls = [START_URL]

    if SCRAPED_DOMAIN:
        www_prefix = 'www.'
        if SCRAPED_DOMAIN.startswith(www_prefix):
            # Drop only the leading label; str.replace would also remove
            # any later "www." occurring inside the host name.
            self.allowed_domains = [
                SCRAPED_DOMAIN,
                SCRAPED_DOMAIN[len(www_prefix):],
            ]
        else:
            self.allowed_domains = [
                SCRAPED_DOMAIN,
                www_prefix + SCRAPED_DOMAIN,
            ]

    if LOCAL_DOMAIN is None:
        raise GrabberSpiderError('No local_url is specified for job')

    super(GrabberSpider, self).__init__(*args, **kw)

    log.msg('Init SQL alchemy engine', level=log.DEBUG)
    engine = engine_from_config(WEB_APP_SETTINGS, 'sqlalchemy.')
    conn = engine.connect()
    # Session bound to this spider's own connection.
    self.dbsession = Session(bind=conn)
    # For now the schema is created from here.
    Base.metadata.create_all(engine)

    if not self.check_local_domain_uniqueness(LOCAL_DOMAIN):
        raise GrabberSpiderError(
            '%s is already used in db' % LOCAL_DOMAIN)

    # Reuse the WebSite row for this domain, or create and persist one.
    website = self.dbsession.query(WebSite) \
        .filter(WebSite.original_url == SCRAPED_DOMAIN) \
        .first()
    if website is None:
        website = WebSite(original_url=SCRAPED_DOMAIN,
                          local_domain=LOCAL_DOMAIN)
        self.dbsession.add(website)
        self.dbsession.commit()
    self.website = website

    # Check the directory for media and create it if it does not exist.
    media_dir = WEB_APP_SETTINGS.get('downloaded.path')
    if media_dir:
        if not os.path.exists(media_dir):
            os.mkdir(media_dir)
    else:
        raise Exception(
            'Directory for downloaded media is not specified in settings')

    # Check that the downloaded media URL is in the application settings.
    if WEB_APP_SETTINGS.get('downloaded.url') is None:
        raise Exception('URL for downloaded media is not specified')
def __init__(self, *args, **kw):
    """Initialise the pipeline and cache the media settings it uses.

    All positional and keyword arguments are forwarded unchanged to the
    parent constructor.
    """
    super(GrabMediaPipeline, self).__init__(*args, **kw)

    # Media storage location and its URL, read from the app settings.
    read_setting = WEB_APP_SETTINGS.get
    self.media_store_path = read_setting('downloaded.path')
    self.media_local_url = read_setting('downloaded.url')

    # Both registries start out empty.
    self.PAGE_MEDIA, self.LOCAL_MEDIA_URI = {}, {}
def __init__(self, *args, **kw):
    """Configure the spider from launch arguments and prepare its storage.

    The following keyword arguments are popped from ``kw`` before the
    remaining arguments are forwarded to the parent constructor:

    * ``SCRAPED_DOMAIN`` -- domain to crawl; ``allowed_domains`` is set to
      both its bare and its ``www.``-prefixed form.
    * ``START_URL`` -- when given, becomes the only entry of ``start_urls``.
    * ``LOCAL_DOMAIN`` -- required; stored on the ``WebSite`` row created
      for the scraped domain.

    Raises:
        GrabberSpiderError: if ``LOCAL_DOMAIN`` is missing, or if
            ``check_local_domain_uniqueness`` rejects it.
        Exception: if ``downloaded.path`` or ``downloaded.url`` is not
            configured in ``WEB_APP_SETTINGS``.
    """
    # Extra parameters of the scraper launch command.
    SCRAPED_DOMAIN = kw.pop('SCRAPED_DOMAIN', None)
    START_URL = kw.pop('START_URL', None)
    LOCAL_DOMAIN = kw.pop('LOCAL_DOMAIN', None)

    if START_URL:
        self.start_urls = [START_URL]

    if SCRAPED_DOMAIN:
        www_prefix = 'www.'
        if SCRAPED_DOMAIN.startswith(www_prefix):
            # Drop only the leading label; str.replace would also remove
            # any later "www." occurring inside the host name.
            self.allowed_domains = [
                SCRAPED_DOMAIN,
                SCRAPED_DOMAIN[len(www_prefix):],
            ]
        else:
            self.allowed_domains = [
                SCRAPED_DOMAIN,
                www_prefix + SCRAPED_DOMAIN,
            ]

    if LOCAL_DOMAIN is None:
        raise GrabberSpiderError('No local_url is specified for job')

    super(GrabberSpider, self).__init__(*args, **kw)

    log.msg('Init SQL alchemy engine', level=log.DEBUG)
    engine = engine_from_config(WEB_APP_SETTINGS, 'sqlalchemy.')
    conn = engine.connect()
    # Session bound to this spider's own connection.
    self.dbsession = Session(bind=conn)
    # For now the schema is created from here.
    Base.metadata.create_all(engine)

    if not self.check_local_domain_uniqueness(LOCAL_DOMAIN):
        raise GrabberSpiderError(
            '%s is already used in db' % LOCAL_DOMAIN)

    # Reuse the WebSite row for this domain, or create and persist one.
    website = self.dbsession.query(WebSite) \
        .filter(WebSite.original_url == SCRAPED_DOMAIN) \
        .first()
    if website is None:
        website = WebSite(original_url=SCRAPED_DOMAIN,
                          local_domain=LOCAL_DOMAIN)
        self.dbsession.add(website)
        self.dbsession.commit()
    self.website = website

    # Check the directory for media and create it if it does not exist.
    media_dir = WEB_APP_SETTINGS.get('downloaded.path')
    if media_dir:
        if not os.path.exists(media_dir):
            os.mkdir(media_dir)
    else:
        raise Exception(
            'Directory for downloaded media is not specified in settings')

    # Check that the downloaded media URL is in the application settings.
    if WEB_APP_SETTINGS.get('downloaded.url') is None:
        raise Exception('URL for downloaded media is not specified')