Exemple #1
0
    def __init__(self, *args, **kw):
        """Configure the spider for one scraping job and prepare its storage.

        The job parameters below are popped from ``kw``; everything else is
        forwarded unchanged to the parent constructor.

        * ``SCRAPED_DOMAIN`` -- domain to crawl. When given, both its
          ``www.`` form and its bare form are put into ``allowed_domains``.
        * ``START_URL`` -- when given, becomes the only entry of
          ``start_urls``.
        * ``LOCAL_DOMAIN`` -- required; stored on the ``WebSite`` row.

        Besides setting attributes, this opens a SQLAlchemy connection and
        session (``self.dbsession``), creates the tables known to
        ``Base.metadata``, looks up or inserts the ``WebSite`` row for the
        scraped domain (``self.website``) and creates the media download
        directory if it does not exist.

        Raises:
            GrabberSpiderError: if ``LOCAL_DOMAIN`` is missing, or if
                ``check_local_domain_uniqueness`` rejects it.
            Exception: if ``downloaded.path`` or ``downloaded.url`` is
                missing from ``WEB_APP_SETTINGS``.
        """
        # Extra parameters passed on the scraper launch command.
        scraped_domain = kw.pop('SCRAPED_DOMAIN', None)
        start_url = kw.pop('START_URL', None)
        local_domain = kw.pop('LOCAL_DOMAIN', None)

        if start_url:
            self.start_urls = [start_url]

        if scraped_domain:
            # Allow both the "www." and the bare form of the domain.
            # Only a leading "www." is treated as the prefix; the previous
            # strip('.')[0] test compared a single character with 'www'
            # and therefore never matched.
            www_prefix = 'www.'
            if scraped_domain.startswith(www_prefix):
                self.allowed_domains = [
                    scraped_domain,
                    scraped_domain[len(www_prefix):],
                ]
            else:
                self.allowed_domains = [
                    scraped_domain,
                    www_prefix + scraped_domain,
                ]

        if local_domain is None:
            raise GrabberSpiderError('No local_url is specified for job')

        super(GrabberSpider, self).__init__(*args, **kw)
        log.msg('Init SQL alchemy engine', level=log.DEBUG)
        engine = engine_from_config(WEB_APP_SETTINGS, 'sqlalchemy.')
        conn = engine.connect()
        self.dbsession = Session(bind=conn)

        # NOTE(review): the original comment here mentioned patching ORM
        # objects to use this local session, but no such code is visible.

        # For now the database schema is created here.
        Base.metadata.create_all(engine)

        if not self.check_local_domain_uniqueness(local_domain):
            raise GrabberSpiderError(
                '%s is already used in db' % local_domain)

        # Reuse the existing WebSite row for this domain, or create one.
        website = self.dbsession.query(WebSite)\
            .filter(WebSite.original_url == scraped_domain)\
            .first()
        if website is None:
            website = WebSite(original_url=scraped_domain,
                              local_domain=local_domain)
            self.dbsession.add(website)
            self.dbsession.commit()
        self.website = website

        # Check the directory for downloaded media and create it (including
        # missing parent directories) if it does not exist.
        media_dir = WEB_APP_SETTINGS.get('downloaded.path')
        if not media_dir:
            raise Exception(
                'Directory for downloaded media is not specified in settings')
        if not os.path.exists(media_dir):
            os.makedirs(media_dir)

        # Check that the downloaded media URL is in the application settings.
        if WEB_APP_SETTINGS.get('downloaded.url') is None:
            raise Exception('URL for downloaded media is not specified')
Exemple #2
0
 def __init__(self, *args, **kw):
     """Initialise the pipeline and its media settings.

     All arguments are forwarded to the parent constructor. The media
     storage path and media URL are then read from ``WEB_APP_SETTINGS``
     (``None`` when a key is absent), and two empty dicts, ``PAGE_MEDIA``
     and ``LOCAL_MEDIA_URI``, are created on the instance.
     """
     super(GrabMediaPipeline, self).__init__(*args, **kw)

     # Where downloaded media is stored and the URL it is served from.
     settings = WEB_APP_SETTINGS
     self.media_store_path = settings.get('downloaded.path')
     self.media_local_url = settings.get('downloaded.url')

     # Per-instance lookup tables; they start out empty.
     self.PAGE_MEDIA = dict()
     self.LOCAL_MEDIA_URI = dict()
Exemple #3
0
    def __init__(self, *args, **kw):
        """Configure the spider for one scraping job and prepare its storage.

        The job parameters below are popped from ``kw``; everything else is
        forwarded unchanged to the parent constructor.

        * ``SCRAPED_DOMAIN`` -- domain to crawl. When given, both its
          ``www.`` form and its bare form are put into ``allowed_domains``.
        * ``START_URL`` -- when given, becomes the only entry of
          ``start_urls``.
        * ``LOCAL_DOMAIN`` -- required; stored on the ``WebSite`` row.

        Besides setting attributes, this opens a SQLAlchemy connection and
        session (``self.dbsession``), creates the tables known to
        ``Base.metadata``, looks up or inserts the ``WebSite`` row for the
        scraped domain (``self.website``) and creates the media download
        directory if it does not exist.

        Raises:
            GrabberSpiderError: if ``LOCAL_DOMAIN`` is missing, or if
                ``check_local_domain_uniqueness`` rejects it.
            Exception: if ``downloaded.path`` or ``downloaded.url`` is
                missing from ``WEB_APP_SETTINGS``.
        """
        # Extra parameters passed on the scraper launch command.
        scraped_domain = kw.pop('SCRAPED_DOMAIN', None)
        start_url = kw.pop('START_URL', None)
        local_domain = kw.pop('LOCAL_DOMAIN', None)

        if start_url:
            self.start_urls = [start_url]

        if scraped_domain:
            # Allow both the "www." and the bare form of the domain.
            # Only a leading "www." is treated as the prefix; the previous
            # strip('.')[0] test compared a single character with 'www'
            # and therefore never matched.
            www_prefix = 'www.'
            if scraped_domain.startswith(www_prefix):
                self.allowed_domains = [
                    scraped_domain,
                    scraped_domain[len(www_prefix):],
                ]
            else:
                self.allowed_domains = [
                    scraped_domain,
                    www_prefix + scraped_domain,
                ]

        if local_domain is None:
            raise GrabberSpiderError('No local_url is specified for job')

        super(GrabberSpider, self).__init__(*args, **kw)
        log.msg('Init SQL alchemy engine', level=log.DEBUG)
        engine = engine_from_config(WEB_APP_SETTINGS, 'sqlalchemy.')
        conn = engine.connect()
        self.dbsession = Session(bind=conn)

        # NOTE(review): the original comment here mentioned patching ORM
        # objects to use this local session, but no such code is visible.

        # For now the database schema is created here.
        Base.metadata.create_all(engine)

        if not self.check_local_domain_uniqueness(local_domain):
            raise GrabberSpiderError(
                '%s is already used in db' % local_domain)

        # Reuse the existing WebSite row for this domain, or create one.
        website = self.dbsession.query(WebSite)\
            .filter(WebSite.original_url == scraped_domain)\
            .first()
        if website is None:
            website = WebSite(original_url=scraped_domain,
                              local_domain=local_domain)
            self.dbsession.add(website)
            self.dbsession.commit()
        self.website = website

        # Check the directory for downloaded media and create it (including
        # missing parent directories) if it does not exist.
        media_dir = WEB_APP_SETTINGS.get('downloaded.path')
        if not media_dir:
            raise Exception(
                'Directory for downloaded media is not specified in settings')
        if not os.path.exists(media_dir):
            os.makedirs(media_dir)

        # Check that the downloaded media URL is in the application settings.
        if WEB_APP_SETTINGS.get('downloaded.url') is None:
            raise Exception('URL for downloaded media is not specified')
Exemple #4
0
 def __init__(self, *args, **kw):
     """Initialise the pipeline and its media settings.

     All arguments are forwarded to the parent constructor. The media
     storage path and media URL are then read from ``WEB_APP_SETTINGS``
     (``None`` when a key is absent), and two empty dicts, ``PAGE_MEDIA``
     and ``LOCAL_MEDIA_URI``, are created on the instance.
     """
     super(GrabMediaPipeline, self).__init__(*args, **kw)

     # Where downloaded media is stored and the URL it is served from.
     settings = WEB_APP_SETTINGS
     self.media_store_path = settings.get('downloaded.path')
     self.media_local_url = settings.get('downloaded.url')

     # Per-instance lookup tables; they start out empty.
     self.PAGE_MEDIA = dict()
     self.LOCAL_MEDIA_URI = dict()