Example #1
    def __init__(self, spidercls, settings):
        if isinstance(settings, dict):
            settings = Settings(settings)

        self.spidercls = spidercls
        self.settings = settings.copy()

        self.signals = SignalManager(self)
        self.stats = load_object(self.settings['STATS_CLASS'])(self)

        handler = LogCounterHandler(self, level=settings.get('LOG_LEVEL'))
        logging.root.addHandler(handler)
        self.signals.connect(lambda: logging.root.removeHandler(handler),
                             signals.engine_stopped)

        lf_cls = load_object(self.settings['LOG_FORMATTER'])
        self.logformatter = lf_cls.from_crawler(self)
        self.extensions = ExtensionManager.from_crawler(self)

        self.spidercls.update_settings(self.settings)
        self.settings.freeze()

        self.crawling = False
        self.spider = None
        self.engine = None
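
A minimal usage sketch of the constructor above, assuming current public Scrapy imports and a hypothetical spider class; as the isinstance check shows, a plain dict is accepted and promoted to Settings:

from scrapy.crawler import Crawler
from scrapy.settings import Settings
from scrapy.spiders import Spider


class DemoSpider(Spider):
    # hypothetical spider, only for illustration
    name = 'demo'
    start_urls = ['http://example.com']


# a plain dict is promoted to Settings by the constructor
crawler = Crawler(DemoSpider, {'LOG_LEVEL': 'INFO'})

# equivalent, passing an explicit Settings object
crawler = Crawler(DemoSpider, Settings({'LOG_LEVEL': 'INFO'}))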
Example #2
def create_root(config):
    from scrapy import log
    from scrapy.settings import Settings
    from slyd.crawlerspec import (CrawlerSpecManager,
        create_crawler_spec_resource)
    from slyd.bot import create_bot_resource
    import slyd.settings
    from slyd.projects import ProjectsResource

    root = Resource()
    root.putChild("static", File(config['docroot']))

    crawler_settings = Settings()
    crawler_settings.setmodule(slyd.settings)
    spec_manager = CrawlerSpecManager(crawler_settings)

    # add project management at /projects
    projects = ProjectsResource(crawler_settings)
    root.putChild('projects', projects)

    # add crawler at /projects/PROJECT_ID/bot
    log.msg("Slybot specs loading from %s/[PROJECT]" % spec_manager.basedir,
        level=log.DEBUG)
    projects.putChild("bot", create_bot_resource(spec_manager))

    # add spec at /projects/PROJECT_ID/spec
    spec = create_crawler_spec_resource(spec_manager)
    projects.putChild("spec", spec)
    return root
Example #3
def get_fetch(log=False):
    settings = Settings()
    settings.set('LOG_ENABLED', log)

    crawler_process = CrawlerProcess(settings)
    crawler = crawler_process.create_crawler()
    crawler_process.start_crawling()

    t = Thread(target=crawler_process.start_reactor)
    t.daemon = True
    t.start()

    shell = Shell(crawler)
    shell.code = 'adsf'

    import threading
    lock = threading.Lock()

    def fetch(url_or_request):
        lock.acquire()
        try:
            shell.fetch(url_or_request)
            response = shell.vars.get('response')
            return response
        finally:
            lock.release()

    return fetch
def test_from_settings_constructs_middleware_with_the_specified_settings():
    settings = Settings()
    settings.set('HTML_STORAGE', {'test': 'settings'})

    downloader = HtmlStorageMiddleware.from_settings(settings)

    assert_that(downloader.settings, is_({'test': 'settings'}))
def make_downloader(save_html_on_codes=[]):
    settings = Settings()
    settings.set('HTML_STORAGE', {
        'gzip_output': True,
        'save_html_on_codes': save_html_on_codes
    })
    return HtmlStorageMiddleware(settings)
def get_project_settings():
    scrapy_module = "uris.urispider.settings"
    
    settings = Settings()
    settings.setmodule(scrapy_module)
    
    return settings
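
The Settings object built from the settings module above is typically handed to a CrawlerProcess; a hedged sketch (the spider name below is an assumption, not part of the original project, and resolving it by name relies on SPIDER_MODULES being defined in that settings module):

from scrapy.crawler import CrawlerProcess

process = CrawlerProcess(settings=get_project_settings())
process.crawl('urispider')  # assumed spider name; a spider class works as well
process.start()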
Example #7
def create_spec_manager(projects_dir=None):
    """Create a CrawlerSpecManager configured to use test settings"""
    crawler_settings = ScrapySettings()
    crawler_settings.setmodule(test_settings)
    projects_dir = projects_dir or test_settings.SPEC_DATA_DIR
    test_settings.SPEC_FACTORY['PARAMS']['location'] = projects_dir
    return SpecManager(crawler_settings)
Example #8
    def __init__(self, spidercls, settings=None):
        if isinstance(settings, dict) or settings is None:
            settings = Settings(settings)

        self.spidercls = spidercls
        self.settings = settings.copy()
        self.spidercls.update_settings(self.settings)

        self.signals = SignalManager(self)
        self.stats = load_object(self.settings['STATS_CLASS'])(self)

        handler = LogCounterHandler(self, level=settings.get('LOG_LEVEL'))
        logging.root.addHandler(handler)
        # the lambda is kept as a Crawler attribute so that it is not
        # garbage collected after __init__ returns
        self.__remove_handler = lambda: logging.root.removeHandler(handler)
        self.signals.connect(self.__remove_handler, signals.engine_stopped)

        lf_cls = load_object(self.settings['LOG_FORMATTER'])
        self.logformatter = lf_cls.from_crawler(self)
        self.extensions = ExtensionManager.from_crawler(self)

        self.settings.freeze()
        self.crawling = False
        self.spider = None
        self.engine = None
Example #9
 def test_autopromote_dicts(self):
     settings = Settings()
     mydict = settings.get('TEST_DICT')
     self.assertIsInstance(mydict, BaseSettings)
     self.assertIn('key', mydict)
     self.assertEqual(mydict['key'], 'val')
     self.assertEqual(mydict.getpriority('key'), 0)
Example #10
 def test_getdict_autodegrade_basesettings(self):
     settings = Settings()
     mydict = settings.getdict('TEST_DICT')
     self.assertIsInstance(mydict, dict)
     self.assertEqual(len(mydict), 1)
     self.assertIn('key', mydict)
     self.assertEqual(mydict['key'], 'val')
Example #11
class FilesPipelineTestCaseCustomSettings(unittest.TestCase):

    def setUp(self):
        self.tempdir = mkdtemp()
        self.pipeline = FilesPipeline(self.tempdir)
        self.default_settings = Settings()

    def tearDown(self):
        rmtree(self.tempdir)

    def test_expires(self):
        another_pipeline = FilesPipeline.from_settings(Settings({'FILES_STORE': self.tempdir,
                                                                'FILES_EXPIRES': 42}))
        self.assertEqual(self.pipeline.expires, self.default_settings.getint('FILES_EXPIRES'))
        self.assertEqual(another_pipeline.expires, 42)

    def test_files_urls_field(self):
        another_pipeline = FilesPipeline.from_settings(Settings({'FILES_STORE': self.tempdir,
                                                                'FILES_URLS_FIELD': 'funny_field'}))
        self.assertEqual(self.pipeline.files_urls_field, self.default_settings.get('FILES_URLS_FIELD'))
        self.assertEqual(another_pipeline.files_urls_field, 'funny_field')

    def test_files_result_field(self):
        another_pipeline = FilesPipeline.from_settings(Settings({'FILES_STORE': self.tempdir,
                                                                'FILES_RESULT_FIELD': 'funny_field'}))
        self.assertEqual(self.pipeline.files_result_field, self.default_settings.get('FILES_RESULT_FIELD'))
        self.assertEqual(another_pipeline.files_result_field, 'funny_field')
Example #12
    def __init__(self, store_uri, download_func=None, settings=None):
        if not store_uri:
            raise NotConfigured
        
        if isinstance(settings, dict) or settings is None:
            settings = Settings(settings)

        cls_name = "FilesPipeline"
        self.store = self._get_store(store_uri)
        resolve = functools.partial(self._key_for_pipe,
                                    base_class_name=cls_name,
                                    settings=settings)
        self.expires = settings.getint(
            resolve('FILES_EXPIRES'), self.EXPIRES
        )
        if not hasattr(self, "FILES_URLS_FIELD"):
            self.FILES_URLS_FIELD = self.DEFAULT_FILES_URLS_FIELD
        if not hasattr(self, "FILES_RESULT_FIELD"):
            self.FILES_RESULT_FIELD = self.DEFAULT_FILES_RESULT_FIELD
        self.files_urls_field = settings.get(
            resolve('FILES_URLS_FIELD'), self.FILES_URLS_FIELD
        )
        self.files_result_field = settings.get(
            resolve('FILES_RESULT_FIELD'), self.FILES_RESULT_FIELD
        )

        super(FilesPipeline, self).__init__(download_func=download_func)
Example #13
 def _crawl(self):
     settings = Settings()
     settings.set('ITEM_PIPELINES', {
         'app.pipelines.JsonWriterPipeline': 300
     })
     self.process = CrawlerProcess(settings)
     self.process.crawl(self, self.name, self.start_urls)
     self.process.start()
def test_constructor_sets_default_settings_values_when_no_settings_are_specified(
        setting_name, expected):
    settings = Settings()
    settings.set('HTML_STORAGE', {})

    downloader = HtmlStorageMiddleware(settings)

    assert_that(downloader.__dict__[setting_name], is_(expected))
Example #15
 def __init__(self, spider):
     Process.__init__(self)
     setting = Settings()
     setting.setmodule(s)
     self.crawler = Crawler(setting)
     self.crawler.configure()
     self.crawler.signals.connect(reactor.stop, signal=signals.spider_closed)
     self.spider = spider
Example #16
 def __init__(self, settings):
     if isinstance(settings, dict):
         settings = Settings(settings)
     self.settings = settings
     smcls = load_object(settings['SPIDER_MANAGER_CLASS'])
     verifyClass(ISpiderManager, smcls)
     self.spiders = smcls.from_settings(settings.frozencopy())
     self.crawlers = set()
     self._active = set()
 def runSpider(self, spider):
     configure_logging({'LOG_FORMAT': '%(asctime)s [%(name)s] %(levelname)s: %(message)s'})
     settings = Settings()
     settings.set('FEED_URI', 'output.json')
     settings.set('FEED_FORMAT', 'json')
     
     runner = CrawlerRunner(settings)
     dfd = runner.crawl(spider)
     dfd.addBoth(lambda _: reactor.stop())
 def __init__(self):
     sets = Settings()
     sets.setmodule(settings, priority='project')
     connection = pymongo.MongoClient(
             sets['MONGODB_SERVER'],
             sets['MONGODB_PORT']
     )
     db = connection[sets['MONGODB_DB']]
     self.collection = db[sets['MONGODB_COLLECTION']]
Example #19
def create_root(config, settings_module):
    from scrapy.settings import Settings
    from .specmanager import SpecManager
    from .authmanager import AuthManager
    from .projectspec import create_project_resource
    from slyd.bot import create_bot_resource
    from slyd.projects import create_projects_manager_resource

    from slyd.splash.ferry import (FerryServerProtocol, FerryServerFactory,
                                   create_ferry_resource)
    from slyd.splash.proxy import ProxyResource

    root = Resource()
    static = Resource()
    for file_name in listdir(config['docroot']):
        file_path = join(config['docroot'], file_name)
        if isfile(file_path):
            static.putChild(file_name, File(file_path))
    static.putChild('main.html', File(join(config['docroot'], 'index.html')))

    root.putChild('static', static)
    root.putChild('assets', File(join(config['docroot'], 'assets')))
    root.putChild('fonts', File(join(config['docroot'], 'assets', 'fonts')))
    root.putChild('', File(join(config['docroot'], 'index.html')))

    settings = Settings()
    settings.setmodule(settings_module)
    spec_manager = SpecManager(settings)

    # add server capabilities at /server_capabilities
    capabilities = Capabilities(spec_manager)
    root.putChild('server_capabilities', capabilities)

    # add projects manager at /projects
    projects = create_projects_manager_resource(spec_manager)
    root.putChild('projects', projects)

    # add crawler at /projects/PROJECT_ID/bot
    projects.putChild('bot', create_bot_resource(spec_manager))

    # add project spec at /projects/PROJECT_ID/spec
    spec = create_project_resource(spec_manager)
    projects.putChild('spec', spec)

    # add websockets for communicating with splash
    factory = FerryServerFactory("ws://127.0.0.1:%s" % config['port'],
                                 debug=False,
                                 assets=config['docroot'])
    factory.protocol = FerryServerProtocol
    factory.setProtocolOptions(allowHixie76=True)
    websocket = create_ferry_resource(spec_manager, factory)
    root.putChild("ws", websocket)

    root.putChild('proxy', ProxyResource())

    auth_manager = AuthManager(settings)
    return auth_manager.protectResource(root)
def get_project_settings(module=None, custom_settings=None):
    crawler_settings = Settings()
    if module is None:
        module = settings.PROJECT_SETTINGS
    crawler_settings.setmodule(module, priority='project')
    if custom_settings:
        assert isinstance(custom_settings, dict)
        crawler_settings.setdict(custom_settings, priority='cmdline')
    return crawler_settings
def import_from_old(request):
    from scrapy.settings import Settings
    from scrapy.crawler import CrawlerProcess
    from yurasic_spider import SongSpider

    settings = Settings()
    settings.setmodule('yurasic_spider.settings', priority='project')
    crawler = CrawlerProcess(settings)
    crawler.crawl(SongSpider)
    crawler.start()
Example #22
def test_scrapy_spider():
    settings = Settings()
    settings.setmodule("tests.scrapy_spider.settings")
    crawler = Crawler(MySpider, settings=settings)
    crawler.signals.connect(reactor.stop, signal=signals.spider_closed)
    crawler.crawl()
    reactor.run()
    stats = crawler.stats.spider_stats["example"]
    assert stats["frontera/crawled_pages_count"] == 5
    assert crawler.spider.callback_calls > 0
Example #23
    def __init__(self, spider):
        Process.__init__(self)
        setting = Settings()
        setting.setmodule(settings, 1)
        self.crawler = Crawler(setting)

        if not hasattr(project, 'crawler'):
            self.crawler.configure()
            self.crawler.signals.connect(reactor.stop, signal=signals.spider_closed)
        self.spider = spider
Example #24
    def test_update_settings(self):
        spider_settings = {'TEST1': 'spider', 'TEST2': 'spider'}
        project_settings = {'TEST1': 'project', 'TEST3': 'project'}
        self.spider_class.custom_settings = spider_settings
        settings = Settings(project_settings, priority='project')

        self.spider_class.update_settings(settings)
        self.assertEqual(settings.get('TEST1'), 'spider')
        self.assertEqual(settings.get('TEST2'), 'spider')
        self.assertEqual(settings.get('TEST3'), 'project')
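
The test relies on Spider.update_settings() writing custom_settings at 'spider' priority, which outranks 'project'; a standalone sketch of the same behaviour (class and keys are illustrative):

from scrapy.settings import Settings
from scrapy.spiders import Spider


class PrioritySpider(Spider):
    name = 'priority_demo'
    custom_settings = {'TEST1': 'spider', 'TEST2': 'spider'}


settings = Settings({'TEST1': 'project', 'TEST3': 'project'}, priority='project')
PrioritySpider.update_settings(settings)
assert settings.get('TEST1') == 'spider'   # spider priority wins
assert settings.get('TEST3') == 'project'  # project-only key is kept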
Example #25
def _construct_scraper_settings():
    """Construct settings for scraper.

    Method constructs settings from default scrapy settings and augments them
    from loaded `CONFIGS`.
    """
    if "scraper" in CONFIGS.keys():
        global SCRAPER_SETTINGS
        SCRAPER_SETTINGS = Settings()
        SCRAPER_SETTINGS.setdict(CONFIGS["scraper"])
 def __init__(self):
     settings = Settings()
     settings.setdict({
         'USER_AGENT': 'Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1)',
         'FEED_FORMAT': 'json',
         'DOWNLOAD_DELAY': 2,
         'REACTOR_THREADPOOL_MAXSIZE': 2,
         'LOG_LEVEL': 'WARNING'
     }, priority='project')
     self.process = CrawlerProcess(settings=settings)
Example #27
def create_spider():
    custom_settings = Settings()
    custom_settings.setmodule(settings)
    crawler = Crawler(
        spidercls=desy_spider.DesySpider,
        settings=custom_settings,
    )
    return desy_spider.DesySpider.from_crawler(
        crawler,
        source_folder='idontexist_but_it_does_not_matter',
    )
Example #28
    def start(self):
        settings = Settings()

        # crawl responsibly
        settings.set("USER_AGENT", "test")
        crawler_obj = Spider()
        crawler = Crawler(crawler_obj, settings)

        # stop reactor when spider closes
        crawler.signals.connect(self.stop, signal=signals.spider_closed)
        crawler.crawl()
Example #29
    def __init__(self, download_func=None, settings=None):
        self.download_func = download_func

        if isinstance(settings, dict) or settings is None:
            settings = Settings(settings)
        resolve = functools.partial(self._key_for_pipe,
                                    base_class_name="MediaPipeline",
                                    settings=settings)
        self.allow_redirects = settings.getbool(
            resolve('MEDIA_ALLOW_REDIRECTS'), False
        )
        self._handle_statuses(self.allow_redirects)
def get_summaries_collection():
    import pymongo
    from scraper.guardianukscraper import settings
    from scrapy.settings import Settings
    sets = Settings()
    sets.setmodule(settings, priority='project')
    connection = pymongo.MongoClient(
            sets['MONGODB_SERVER'],
            sets['MONGODB_PORT']
    )
    db = connection[sets['MONGODB_DB']]
    return db[sets['MONGODB_COLLECTION']]
 def setUp(self):
     s3reqh = S3DownloadHandler(Settings(), self.AWS_ACCESS_KEY_ID,
                                self.AWS_SECRET_ACCESS_KEY,
                                httpdownloadhandler=HttpDownloadHandlerMock)
     self.download_request = s3reqh.download_request
     self.spider = Spider('foo')
Example #32
from datetime import datetime
from os.path import dirname, join

import pytest
from city_scrapers_core.constants import COMMISSION, PASSED
from city_scrapers_core.utils import file_response
from freezegun import freeze_time
from scrapy.settings import Settings

from city_scrapers.spiders.chi_board_elections import ChiBoardElectionsSpider

test_response = file_response(
    join(dirname(__file__), "files", "chi_board_elections.html"),
    url='https://app.chicagoelections.com/pages/en/board-meetings.aspx')
spider = ChiBoardElectionsSpider()
spider.settings = Settings(values={"CITY_SCRAPERS_ARCHIVE": False})

freezer = freeze_time('2018-11-30')
freezer.start()

parsed_items = [item for item in spider._next_meeting(test_response)]

freezer.stop()


def test_title():
    assert parsed_items[0]['title'] == 'Electoral Board'


def test_description():
    assert parsed_items[0]['description'] == ''
Example #33
class FerryServerProtocol(WebSocketServerProtocol):

    _handlers = {
        'load': 'load_page',
        'interact': 'interact_page'
    }
    assets = './'
    settings = Settings()

    @property
    def tab(self):
        return self.factory[self].tab

    @property
    def spider(self):
        return self.factory[self].spider

    @property
    def spiderspec(self):
        return self.factory[self].spiderspec

    @property
    def user(self):
        return self.factory[self]

    def onConnect(self, request):
        try:
            auth_info = json.loads(request.headers['x-auth-info'])
        except (KeyError, TypeError):
            return
        self.session_id = ''
        self.auth_info = auth_info
        self.factory[self] = User(auth_info)

    def onOpen(self):
        if self not in self.factory:
            self.sendClose(1000, 'Invalid Connection missing required '
                                 'parameters')

    def onMessage(self, payload, isbinary):
        close_old_connections()
        payload = payload.decode('utf-8')
        data = json.loads(payload)
        project = data.get('project', data.get('_meta', {}).get('project'))
        storage = create_project_storage(project, author=self.user)
        projects = storage.__class__.get_projects(self.user)
        if project and str(project) not in projects:
            self.sendMessage({'status': 4004, 'message': 'Project not found'})
            return
        deferred = defer.maybeDeferred(
            wrap_callback, None, self._on_message, storage=storage, data=data)
        deferred.addCallbacks(self.sendMessage,
                              partial(self.send_error, data))

    def _on_message(self, storage, data):
        if '_meta' in data and 'session_id' in data['_meta']:
            self.session_id = data['_meta']['session_id']

        if is_blacklisted(data.get('url', '')):
            blacklist_error(data, self)
            return

        command = data['_command']
        command = self._handlers.get(command, command)
        with data_store_context():
            commands = Commands(data, self, storage)
            result = getattr(commands, command, lambda: None)()
        if result:
            result.setdefault('_command', data.get('_callback', command))
            if '_meta' in data and 'id' in data['_meta']:
                result['id'] = data['_meta']['id']
        return result

    def onClose(self, was_clean, code, reason):
        if self in self.factory:
            if self.tab is not None:
                self.tab.close()
                self.tab.network_manager.closed = True
            msg_data = {'session': self.session_id,
                        'session_time': 0,
                        'user': self.user.name}
            msg = (u'Websocket Closed: id=%(session)s t=%(session_time)s '
                   u'user=%(user)s command=' % (msg_data))
            log.err(msg)

    def sendMessage(self, payload, is_binary=False):
        if isinstance(payload, dict) and '_command' in payload:
            super(FerryServerProtocol, self).sendMessage(
                json.dumps(payload, cls=ScrapyJSONEncoder, sort_keys=True),
                is_binary
            )

    def send_error(self, data, failure):
        e = failure.value
        command = data.get('_callback', data.get('_command'))
        id_ = data.get('_meta', {}).get('id')
        if isinstance(e, BaseHTTPError):
            code, reason, message = e.status, e.title, e.body
        elif isinstance(e, KeyError):
            requested_command = data.get('_command')
            code = 4000
            reason = "Unknown command"
            if requested_command:
                message = 'No command named "%s" found.' % requested_command
            else:
                message = "No command received"
        else:
            code = 500
            reason = "Internal Server Error"
            message = "An unexpected error has occurred."
        log.err(failure)
        event_id = getattr(failure, 'sentry_event_id', None)
        if event_id:
            message = "%s (Event ID: %s)" % (message, event_id)

        response = {
            'error': code,
            'reason': reason,
            'message': message,
        }
        if command:
            response['_command'] = command
        if id_:
            response['id'] = id_

        self.sendMessage(response)

    def getElementByNodeId(self, nodeid):
        self.tab.web_page.mainFrame().evaluateJavaScript(
            'livePortiaPage.pyGetByNodeId(%s)' % nodeid
        )
        return self.js_api.getReturnedElement()

    def open_tab(self, meta=None):
        if meta is None:
            meta = {}
        manager = SplashQNetworkAccessManager(
            request_middlewares=[],
            response_middlewares=[],
            verbosity=defaults.VERBOSITY
        )
        manager.setCache(None)

        data = {}
        data['uid'] = id(data)

        self.factory[self].tab = PortiaBrowserTab(
            network_manager=manager,
            splash_proxy_factory=None,
            verbosity=defaults.VERBOSITY,
            render_options=RenderOptions(data, defaults.MAX_TIMEOUT),
            visible=True,
        )
        manager.tab = self.tab
        self.tab.register_callback('on_request', self._configure_requests)
        self.tab.register_callback('on_response', self._set_tab_html)
        main_frame = self.tab.web_page.mainFrame()
        cookiejar = PortiaCookieJar(self.tab.web_page, self)
        manager.cookiejar = cookiejar
        manager.setCookieJar(cookiejar)
        if meta.get('cookies'):
            cookiejar.put_client_cookies(meta['cookies'])

        main_frame.loadStarted.connect(self._on_load_started)
        main_frame.loadFinished.connect(self._on_load_finished)

        self.js_api = PortiaJSApi(self)
        main_frame.javaScriptWindowObjectCleared.connect(
            self._on_javascript_window_cleared)
        main_frame.initialLayoutCompleted.connect(self._on_layout_completed)

        self.tab.set_images_enabled(True)
        self.tab.set_viewport(meta.get('viewport') or _DEFAULT_VIEWPORT)
        self.tab.set_user_agent(meta.get('user_agent') or _DEFAULT_USER_AGENT)
        self.tab.loaded = False

    def _on_load_started(self):
        self.load_id = short_guid()
        self.sendMessage({'_command': 'loadStarted', 'id': self.load_id,
                          'url': self.tab.url})
        self.tab.initial_layout_completed = False

    def _on_load_finished(self):
        if getattr(self.tab, '_raw_url', None) != self.tab.url:
            page = self.tab.web_page
            page.triggerAction(page.ReloadAndBypassCache, False)
        self.sendMessage({'_command': 'loadFinished', 'url': self.tab.url,
                          'id': getattr(self, 'load_id', None)})

    def _configure_requests(self, request, operation, data):
        if request.hasRawHeader('Accept'):
            url = six.binary_type(request.url().toEncoded())
            url_path = urlparse(url).path.lower()
            accepts = str(request.rawHeader('Accept')).lower()
            if (accepts.startswith(STORED_TYPES) or _is_xml(accepts) or
                    url_path.endswith(STORED_EXTENSIONS)):
                request.track_response_body = True
            elif (accepts.startswith(IGNORED_TYPES) or
                  url_path.endswith(MEDIA_EXTENSIONS)):
                drop_request(request)

    def _set_tab_html(self, reply, har, content):
        url = decode(reply.url().toString())
        if content is not None and url == self.tab.url:
            self.tab._raw_html = decode(content)
            self.tab._raw_url = url

    def _on_layout_completed(self):
        if not getattr(self.tab, 'initial_layout_completed', False):
            self.populate_window_object()
            self.tab.initial_layout_completed = True

    def _on_javascript_window_cleared(self):
        if getattr(self.tab, 'initial_layout_completed', False):
            self.populate_window_object()

    def populate_window_object(self):
        main_frame = self.tab.web_page.mainFrame()
        main_frame.addToJavaScriptWindowObject('__portiaApi', self.js_api)
        self.tab.run_js_files(
            os.path.join(self.assets, 'splash_content_scripts'),
            handle_errors=False)

    def open_spider(self, meta, storage=None, project=None):
        if not (meta.get('project') and meta.get('spider')):
            return {'error': 4005, 'reason': 'No project specified'}

        if (self.user.authorized_projects is not None and
                meta['project'] not in self.user.authorized_projects and
                not self.user.staff):
            return {'error': 4004,
                    'reason': 'Project "%s" not found' % meta['project']}
        spider_name = meta['spider']

        if project is None:
            project = Project(storage, id=meta.get('project'))

        try:
            spider_model = project.spiders[spider_name]
        except (IOError, KeyError):
            return {'error': 4004,
                    'reason': 'Spider "%s" not found' % spider_name}
        spider_name, spider, items, extractors = load_spider_data(spider_model)
        if not self.settings.get('SPLASH_URL'):
            self.settings.set('SPLASH_URL', 'portia')
        self.factory[self].spider = IblSpider(spider_name, spider, items,
                                              extractors, self.settings)
        self.factory[self].spiderspec = SpiderSpec(
            project, spider_name, spider, items, extractors)

    def __repr__(self):
        return '{}({})'.format(self.__class__.__name__, str(self))

    def __str__(self):
        tab, spider, spec = '', '', ''
        if self.tab:
            try:
                tab = '{}({})'.format(
                    self.tab.__class__.__name__, self.tab.url)
            except RuntimeError:
                tab = 'MISSING'
        if self.spider:
            spider = '{}({})'.format(
                self.spider.__class__.__name__, self.spider.name)
        if self.spiderspec:
            spec = str(self.spiderspec)
        return ', '.join(filter(bool, (tab, spider, spec)))
Example #34
from scrapy.crawler import CrawlerProcess
from scrapy.settings import Settings

from les4.les.gb_parse.spiders.autoyoula import AutoyoulaSpider

if __name__ == '__main__':
    crawler_settings = Settings()
    crawler_settings.setmodule('gb_parse.settings')
    crawler_process = CrawlerProcess(settings=crawler_settings)
    crawler_process.crawl(AutoyoulaSpider)
    crawler_process.start()
Example #35
 def setUp(self):
     self.crawler = Crawler(DefaultSpider, Settings())
Example #36
class FerryServerProtocol(WebSocketServerProtocol):

    _handlers = {
        'load': load_page,
        'interact': interact_page,
        'close_tab': close_tab,
        'heartbeat': lambda d, s: None,
        'resize': resize,
        'resolve': resolve,
        'extract_items': extract_items,
        'save_html': save_html,
        'update_spider': update_spider
    }
    assets = './'
    settings = Settings()

    @property
    def tab(self):
        return self.factory[self].tab

    @property
    def spider(self):
        return self.factory[self].spider

    @property
    def spiderspec(self):
        return self.factory[self].spiderspec

    @property
    def user(self):
        return self.factory[self]

    def onConnect(self, request):
        try:
            auth_info = json.loads(request.headers['x-auth-info'])
        except (KeyError, TypeError):
            return
        self.session_id = ''
        self.auth_info = auth_info
        self.factory[self] = User(auth_info)

    def onOpen(self):
        if self not in self.factory:
            self.sendClose(1000, 'Invalid Connection missing required '
                                 'parameters')

    def onMessage(self, payload, isbinary):
        close_old_connections()
        pool = getattr(Repoman, 'pool', None)
        payload = payload.decode('utf-8')
        data = json.loads(payload)
        project = data.get('project', data.get('_meta', {}).get('project'))
        self.storage = create_project_storage(project, author=self.user)
        projects = self.storage.__class__.get_projects(self.user)
        if project and str(project) not in projects:
            self.sendMessage({'status': 4004, 'message': 'Project not found'})
            return
        if pool is not None:
            deferred = defer.maybeDeferred(
                pool.run_deferred_with_connection, wrap_callback,
                self._on_message, self.storage, data=data)
        else:
            deferred = defer.maybeDeferred(
                wrap_callback, None, self._on_message, self.storage, data=data)
        deferred.addCallbacks(self.sendMessage,
                              partial(self.send_error, data))

    def _on_message(self, data):
        if '_meta' in data and 'session_id' in data['_meta']:
            self.session_id = data['_meta']['session_id']

        if is_blacklisted(data.get('url', ''), self.settings):
            blacklist_error(data, self)
            return

        command = data['_command']
        with data_store_context():
            result = self._handlers[command](data, self)
        if result:
            result.setdefault('_command', data.get('_callback', command))
            if '_meta' in data and 'id' in data['_meta']:
                result['id'] = data['_meta']['id']
        return result

    def onClose(self, was_clean, code, reason):
        if self in self.factory:
            if self.tab is not None:
                self.tab.close()
                self.tab.network_manager.closed = True
            msg_data = {'session': self.session_id,
                        'session_time': 0,
                        'user': self.user.name}
            msg = (u'Websocket Closed: id=%(session)s t=%(session_time)s '
                   u'user=%(user)s command=' % (msg_data))
            log.err(msg)

    def sendMessage(self, payload, is_binary=False):
        if isinstance(payload, dict) and '_command' in payload:
            super(FerryServerProtocol, self).sendMessage(
                json.dumps(payload, cls=ScrapyJSONEncoder, sort_keys=True),
                is_binary
            )
        self.factory[self].spider, self.storage = None, None

    def send_error(self, data, failure):
        e = failure.value
        command = data.get('_callback', data.get('_command'))
        id_ = data.get('_meta', {}).get('id')
        if isinstance(e, BaseHTTPError):
            code, reason, message = e.status, e.title, e.body
        elif isinstance(e, KeyError):
            requested_command = data.get('_command')
            code = 4000
            reason = "Unknown command"
            if requested_command:
                message = 'No command named "%s" found.' % requested_command
            else:
                message = "No command received"
        else:
            code = 500
            reason = "Internal Server Error"
            message = "An unexpected error has occurred."
        log.err(failure)
        event_id = getattr(failure, 'sentry_event_id', None)
        if event_id:
            message = "%s (Event ID: %s)" % (message, event_id)

        response = {
            'error': code,
            'reason': reason,
            'message': message,
        }
        if command:
            response['_command'] = command
        if id_:
            response['id'] = id_

        self.sendMessage(response)

    def getElementByNodeId(self, nodeid):
        self.tab.web_page.mainFrame().evaluateJavaScript(
            'livePortiaPage.pyGetByNodeId(%s)' % nodeid
        )
        return self.js_api.getReturnedElement()

    def open_tab(self, meta=None):
        if meta is None:
            meta = {}
        manager = PortiaNetworkManager(
            request_middlewares=[],
            response_middlewares=[],
            verbosity=defaults.VERBOSITY
        )
        manager.setCache(None)

        data = {}
        data['uid'] = id(data)

        self.factory[self].tab = BrowserTab(
            network_manager=manager,
            splash_proxy_factory=None,
            verbosity=0,
            render_options=RenderOptions(data, defaults.MAX_TIMEOUT),
            visible=True,
        )
        manager.tab = self.tab
        main_frame = self.tab.web_page.mainFrame()
        cookiejar = PortiaCookieJar(self.tab.web_page, self)
        manager.cookiejar = cookiejar
        manager.setCookieJar(cookiejar)
        if meta.get('cookies'):
            cookiejar.put_client_cookies(meta['cookies'])

        main_frame.loadStarted.connect(self._on_load_started)
        self.js_api = PortiaJSApi(self)
        main_frame.javaScriptWindowObjectCleared.connect(
            self.populate_window_object
        )

        self.tab.set_images_enabled(True)
        self.tab.set_viewport(meta.get('viewport') or _DEFAULT_VIEWPORT)
        self.tab.set_user_agent(meta.get('user_agent') or _DEFAULT_USER_AGENT)
        self.tab.loaded = False

    def _on_load_started(self):
        self.sendMessage({'_command': 'loadStarted'})

    def populate_window_object(self):
        main_frame = self.tab.web_page.mainFrame()
        main_frame.addToJavaScriptWindowObject('__portiaApi', self.js_api)
        self.tab.run_js_files(
            os.path.join(self.assets, 'splash_content_scripts'),
            handle_errors=False)

    def open_spider(self, meta, project=None):
        if not (meta.get('project') and meta.get('spider')):
            return {'error': 4005, 'reason': 'No project specified'}

        if (self.user.authorized_projects is not None and
                meta['project'] not in self.user.authorized_projects and
                not self.user.staff):
            return {'error': 4004,
                    'reason': 'Project "%s" not found' % meta['project']}
        spider_name = meta['spider']

        # project_meta = meta.get('project')
        # project_id = (project_meta if isinstance(project_meta, six.string_types)
        #               else project_meta.id)
        # project = Project(self.storage, id=project_id)

        if project is None:
            project = Project(self.storage, id=meta.get('project'))

        try:
            spider_model = project.spiders[spider_name]
        except IOError:
            return {'error': 4003,
                    'reason': 'Spider "%s" not found' % spider_name}
        spider_name, spider, items, extractors = load_spider_data(spider_model)
        if not self.settings.get('SPLASH_URL'):
            self.settings.set('SPLASH_URL', 'portia')
        self.factory[self].spider = IblSpider(spider_name, spider, items,
                                              extractors, self.settings)
        self.factory[self].spiderspec = SpiderSpec(
            project, spider_name, spider, items, extractors)

    def update_spider(self, meta, spider=None, template=None, items=None,
                      extractors=None):
        if not hasattr(self.factory[self], 'spiderspec'):
            return self.open_spider(meta)
        spec = self.factory[self].spiderspec
        if spec is None or spec.name != meta.get('spider'):
            return self.open_spider(meta)
        items = items or spec.items
        extractors = extractors or spec.extractors
        if spider:
            spider['templates'] = spec.spider['templates']
        else:
            spider = spec.spider
        if template:
            for idx, tmpl in enumerate(spider['templates']):
                if template['original_body'] == tmpl['original_body']:
                    spider['templates'][idx] = template
                    break
            else:
                spider['templates'].append(template)
        self.factory[self].spider = IblSpider(meta['spider'], spider, items,
                                              extractors, self.settings)
        self.factory[self].spiderspec = SpiderSpec(
            spec.project, meta['spider'], spider, items, extractors)
Example #37
 def setUp(self):
     self.spider = Mock()
     self.settings = Settings()
Example #38
 def test_feed_export_config_implicit_formats(self):
     settings = Settings()
     self.assertEqual(
         {'items_1.json': {'format': 'json'}, 'items_2.xml': {'format': 'xml'}, 'items_3.csv': {'format': 'csv'}},
         feed_process_params_from_cli(settings, ['items_1.json', 'items_2.xml', 'items_3.csv'])
     )
Example #39
 def test_feed_export_config_mismatch(self):
     settings = Settings()
     self.assertRaises(
         UsageError,
         feed_process_params_from_cli, settings, ['items1.dat', 'items2.dat'], 'noformat'
     )
Example #40
 def test_feed_export_config_stdout(self):
     settings = Settings()
     self.assertEqual(
         {'stdout:': {'format': 'pickle'}},
         feed_process_params_from_cli(settings, ['-:pickle'])
     )
Example #41
 def test_feed_export_config_overwrite(self):
     settings = Settings()
     self.assertEqual(
         {'output.json': {'format': 'json', 'overwrite': True}},
         feed_process_params_from_cli(settings, [], None, ['output.json'])
     )
Example #42
 def setUp(self):
     self.crawler = mock.MagicMock()
     self.crawler.settings = Settings()
     self.crawler.engine.download = mock.MagicMock()
Example #43
from scrapy.crawler import CrawlerProcess
from scrapy.settings import Settings
from edu_parse.spiders.autoyoula import AutoyoulaSpider

if __name__ == '__main__':
    crawler_settings = Settings()
    crawler_settings.setmodule("edu_parse.settings")
    crawler_proc = CrawlerProcess(settings=crawler_settings)
    crawler_proc.crawl(AutoyoulaSpider)
    crawler_proc.start()
    pass
 def setUp(self):
     self.tmpname = self.mktemp()
     with open(self.tmpname + '^', 'w') as f:
         f.write('0123456789')
     self.download_request = FileDownloadHandler(
         Settings()).download_request
def start_crawler(spider, search):
    # Set up spider
    spider = TripAdvisorSpider(search=search)

    # Set up settings
    settings = Settings()
    # settings.overrides['FEED_FORMAT']='csv'
    # settings.overrides['FEED_URI']='tripadvisor_{0}.csv'.format(search)
    settings.set('CLOSESPIDER_ITEMCOUNT', False)
    settings.set('ROBOTSTXT_OBEY', False)
    settings.set('COOKIES_ENABLED', False)
    settings.set(
        'ITEM_PIPELINES',
        {'tripadvisor_scraper.pipelines.TripadvisorScraperPipeline': 300})
    settings.set('DOWNLOAD_DELAY', 3)
    settings.set('LOG_FILENAME', 'log.log')
    # settings.overrides['LOG_FILENAME'] = 'log.log'
    # settings.overrides['ROBOTSTXT_OBEY'] = False # Ignore robots.txt
    # settings.overrides['CLOSESPIDER_ITEMCOUNT']=1
    # settings.overrides['DOWNLOAD_DELAY'] = 3
    # settings.overrides['COOKIES_ENABLED'] = False
    # settings.overrides['ITEM_PIPELINES'] = {
    #    'tripadvisor_scraper.pipelines.TripadvisorScraperPipeline': 300,
    # }

    # Set up crawler
    crawler = Crawler(spider, settings)
    # crawler.configure()
    crawler.signals.connect(spider_closed, signal=signals.spider_closed)
    crawler.crawl(spider)
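
The helper above targets the old Crawler API (see the commented-out crawler.configure() call); on current Scrapy the same crawl is usually driven by a CrawlerProcess, roughly as in this sketch (imports for the spider and pipeline are assumed to exist in the project):

from scrapy.crawler import CrawlerProcess
from scrapy.settings import Settings


def start_crawler_with_process(search):
    settings = Settings()
    settings.set('ROBOTSTXT_OBEY', False)
    settings.set('COOKIES_ENABLED', False)
    settings.set('DOWNLOAD_DELAY', 3)
    settings.set(
        'ITEM_PIPELINES',
        {'tripadvisor_scraper.pipelines.TripadvisorScraperPipeline': 300})

    process = CrawlerProcess(settings)
    process.crawl(TripAdvisorSpider, search=search)  # spider class from the project
    process.start()  # blocks until the crawl finishes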
class StorageTest(TestCase):
    def setUp(self):
        self.spider = Spider('foo')
        self.settings = Settings()
        self.settings.setmodule(default_settings)

    def tearDown(self):
        pass

    def test_environment(self):
        oldenv = os.environ.copy()

        os.environ['http_proxy'] = http_proxy = 'https://proxy.for.http:3128'
        os.environ['https_proxy'] = https_proxy = 'http://proxy.for.https:8080'
        os.environ.pop('file_proxy', None)

        settings = deepcopy(self.settings)
        storage = EnvironmentStorage(settings)

        storage.open_spider(self.spider)

        self.assertTrue(storage)

        self.assertIn('http', storage)
        self.assertIn('https', storage)
        self.assertNotIn('file_proxy', storage)
        self.assertSequenceEqual(
            storage['http'],
            get_proxy(http_proxy, 'http', storage.auth_encoding))
        self.assertSequenceEqual(
            storage['https'],
            get_proxy(https_proxy, 'https', storage.auth_encoding))

        storage.close_spider(self.spider)
        os.environ = oldenv

    def test_settings(self):
        http_proxy_1 = 'https://proxy.for.http.1:3128'
        http_proxy_2 = 'https://proxy.for.http.2:3128'
        https_proxy_1 = 'http://proxy.for.https.1:8080'
        https_proxy_2 = 'http://proxy.for.https.2:8080'

        local_settings = {
            'HTTPPROXY_ENABLED': True,
            'HTTPPROXY_PROXIES': {
                'http': [http_proxy_1, http_proxy_2],
                'https': [https_proxy_1, https_proxy_2]
            }
        }
        settings = deepcopy(self.settings)
        settings.setdict(local_settings)
        storage = SettingsStorage(settings)

        storage.open_spider(self.spider)

        self.assertTrue(storage)

        self.assertIn('http', storage)
        self.assertIn('https', storage)

        self.assertSequenceEqual(
            storage['http'],
            get_proxy(http_proxy_1, 'http', storage.auth_encoding))
        storage.close_spider(self.spider)
def main():
    global settings
    from scrapy import cmdline
    from scrapy.settings import Settings

    parser = argparse.ArgumentParser(description=__doc__, add_help=False)
    parser.add_argument('-h',
                        '--help',
                        dest='help',
                        help='show this help message',
                        action='store_true',
                        default=False)

    act_group = parser.add_argument_group(title='actions')
    act_group.add_argument('-r',
                           '--run',
                           dest='cmd',
                           help='run the spider and collect data',
                           action='store_const',
                           const='runspider')
    act_group.add_argument('-s',
                           '--shell',
                           dest='cmd',
                           help='debug in the interactive shell',
                           action='store_const',
                           const='shell')
    act_group.add_argument('-v',
                           '--view',
                           dest='cmd',
                           help='open a URL fetched by the spider in a browser',
                           action='store_const',
                           const='view')

    run_group = parser.add_argument_group(title='run options')
    run_group.add_argument('-n',
                           '--limit-num',
                           dest='limit',
                           default=0,
                           help='limit the total number of requests; 0 (the default) means no limit',
                           type=int)
    run_group.add_argument('-m',
                           '--max-request-num',
                           dest='max',
                           default=30,
                           help='maximum number of concurrent requests; default 30, 0 means no limit',
                           type=int)
    run_group.add_argument("-a",
                           dest="spargs",
                           action="append",
                           default=[],
                           metavar="NAME=VALUE",
                           help="设置爬虫参数(可以重复)")
    run_group.add_argument("-o",
                           "--output",
                           metavar="FILE",
                           help="输出 items 结果集 值FILE (使用 -o 将定向至 stdout)")
    run_group.add_argument("-t",
                           "--output-format",
                           metavar="FORMAT",
                           help="基于 -o 选项,使用指定格式输出 items")
    run_group.add_argument('-d',
                           '--dist',
                           help='distributed run; lets other processes submit data',
                           action='store_true',
                           default=False)

    gen_group = parser.add_argument_group(title='general options')
    gen_group.add_argument('-u',
                           '--url',
                           help='URL to use: the start URL for --run, the URL '
                                'to debug for --shell, or the URL to open '
                                'for --view')

    args = parser.parse_args()
    if args.help:
        parser.print_help()
    elif args.cmd:
        settings = Settings(settings)
        if args.cmd == 'runspider':
            argv = [sys.argv[0], args.cmd, sys.argv[0]]
            for vo in run_group._group_actions:
                opt = vo.option_strings[0]
                val = args.__dict__.get(vo.dest)
                if val == vo.default:
                    continue
                if isinstance(val, (list, tuple)):
                    val = ' '.join(val)
                if vo.dest == 'limit':
                    settings['CLOSESPIDER_ITEMCOUNT'] = val
                    continue
                elif vo.dest == 'max':
                    settings['CONCURRENT_REQUESTS'] = val
                    continue
                elif vo.dest == 'dist':
                    settings['DESTRIBUT_RUN'] = val
                    continue
                argv.extend([opt, val])
            if args.url:
                argv.extend(['-a', 'START_URL=%s' % args.url])
        elif args.cmd == 'shell':
            argv = [sys.argv[0], args.cmd]
            if args.url:
                argv.append(args.url)
        elif args.cmd == 'view':
            if not args.url:
                print('please set the --url option')
                return None
            argv = [sys.argv[0], args.cmd, args.url]
        cmdline.execute(argv, settings)
    else:
        parser.print_usage()
Example #48
 def test_feed_export_config_invalid_format(self):
     settings = Settings()
     self.assertRaises(UsageError, feed_process_params_from_cli, settings, ['items.dat'], 'noformat')
 def setUp(self):
     self.download_handler = DataURIDownloadHandler(Settings())
     self.download_request = self.download_handler.download_request
     self.spider = Spider('foo')
from scrapy.crawler import CrawlerProcess
from scrapy.settings import Settings
from avito_parse import settings
from avito_parse.spiders.avito import AvitoSpider

if __name__ == '__main__':
    scr_settings = Settings()
    scr_settings.setmodule(settings)
    process = CrawlerProcess(settings=scr_settings)
    process.crawl(AvitoSpider)
    process.start()
Example #51
# import dmoz spider class
from DmozSpider import DmozSpider

# scrapy api
from scrapy import signals, log
from twisted.internet import reactor
from scrapy.crawler import Crawler
from scrapy.settings import Settings

def spider_closing(spider):
    """Activates on spider closed signal"""
    log.msg("Closing reactor", level=log.INFO)
    reactor.stop()

# log.start(loglevel=log.DEBUG)
settings = Settings()

# crawl responsibly
settings.set("USER_AGENT", "Kiran Koduru (+http://kirankoduru.github.io)")
crawler = Crawler(DmozSpider(), settings)

# stop reactor when spider closes
crawler.signals.connect(spider_closing, signal=signals.spider_closed)

#crawler.configure()
crawler.crawl()
#crawler.start()
reactor.run()
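
This example uses the pre-1.0 API (scrapy.log, Crawler(spider_instance, settings), crawler.configure()); a hedged sketch of the same crawl on current Scrapy, where CrawlerProcess owns the reactor:

from scrapy.crawler import CrawlerProcess
from scrapy.settings import Settings

settings = Settings()
settings.set("USER_AGENT", "Kiran Koduru (+http://kirankoduru.github.io)")

process = CrawlerProcess(settings)
process.crawl(DmozSpider)  # pass the spider class, not an instance
process.start()            # starts and stops the Twisted reactor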
Example #52
 def setUp(self):
     self.tempdir = mkdtemp()
     self.pipeline = FilesPipeline.from_settings(
         Settings({'FILES_STORE': self.tempdir}))
     self.pipeline.download_func = _mocked_download_func
     self.pipeline.open_spider(None)
 def setUp(self):
     crawler = get_crawler(Spider)
     self.spider = Spider.from_crawler(crawler, name='foo')
     self.mw = HttpErrorMiddleware(Settings({}))
     self.req = Request('http://scrapytest.org')
     self.res200, self.res404 = _responses(self.req, [200, 404])
Example #54
 def test_enabled_from_settings(self):
     settings = Settings()
     mwman = TestMiddlewareManager.from_settings(settings)
     classes = [x.__class__ for x in mwman.middlewares]
     self.assertEqual(classes, [M1, M3])
            "//*[contains(@itemprop, 'track')]//@href").extract()
        for link in link_to_lyrics:
            yield Request(url=''.join(('http://www.songteksten.nl', link)),
                          callback=self.parse_lyrics)

    def parse_lyrics(self, response):
        hxs = Selector(response)
        item = LyricsSearchItem()
        item['lyrics'] = hxs.xpath(
            "//*[contains(@itemprop, 'description')]/text()").extract()
        return item


def callback(spider, reason):
    stats = spider.crawler.stats.get_stats()
    reactor.stop()


settings = Settings()
settings.set('ITEM_PIPELINES', {'pipeline.LyricsSearchPipeline': 100})


def crawl():
    crawler = Crawler(settings)
    spider = MySpider()
    crawler.signals.connect(callback, signal=signals.spider_closed)
    crawler.configure()
    crawler.crawl(spider)
    crawler.start()
    reactor.run()
 def setUp(self):
     self.tmpname = self.mktemp()
     fd = open(self.tmpname + '^', 'w')
     fd.write('0123456789')
     fd.close()
     self.download_request = FileDownloadHandler(Settings()).download_request
Example #57
SUBMIT_TYPES = {'submit button'}
DEFAULT_POST_HEADERS = {b'Content-Type': b'application/x-www-form-urlencoded'}

USER_AGENT = ('Mozilla/5.0 (X11; Linux i686) AppleWebKit/537.36 '
              '(KHTML, like Gecko) Ubuntu Chromium/43.0.2357.130 '
              'Chrome/43.0.2357.130 Safari/537.36')

base_settings = Settings(values=dict(
    TELNETCONSOLE_ENABLED=False,
    ROBOTSTXT_OBEY=False,
    DOWNLOAD_DELAY=2.0,
    DEPTH_PRIORITY=1,
    CONCURRENT_REQUESTS=2,
    CONCURRENT_REQUESTS_PER_DOMAIN=2,
    SCHEDULER_DISK_QUEUE='scrapy.squeues.PickleFifoDiskQueue',
    SCHEDULER_MEMORY_QUEUE='scrapy.squeues.FifoMemoryQueue',
    # DOWNLOADER_MIDDLEWARES are set in get_settings
    USER_AGENT=USER_AGENT,
    DOWNLOADER_MIDDLEWARES={
        'scrapy.downloadermiddlewares.httpproxy.HttpProxyMiddleware': None,
        # Placed before splash middleware
        'autologin.middleware.ProxyMiddleware': 720,
    },
))


def crawl_runner(extra_settings=None):
    settings = base_settings.copy()
    if extra_settings is not None:
        settings.update(extra_settings, priority='cmdline')
    if settings.get('SPLASH_URL'):
Example #58
from scrapy.crawler import CrawlerProcess
from scrapy.settings import Settings

from bookparser import settings
from bookparser.spiders.book24 import Book24Spider
from bookparser.spiders.labirint import LabirintSpider

if __name__ == "__main__":
    crawler_settings = Settings()
    crawler_settings.setmodule(settings)

    process = CrawlerProcess(settings=crawler_settings)
    process.crawl(Book24Spider)
    process.crawl(LabirintSpider)

    process.start()
 def setUp(self):
     self.spider = Spider('foo')
     self.settings = Settings()
     self.settings.setmodule(default_settings)
 def setUp(self):
     self.spider = Spider('foo')
     self.mw = HttpErrorMiddleware(Settings({'HTTPERROR_ALLOW_ALL': True}))
     self.req = Request('http://scrapytest.org')
     self.res200, self.res404, self.res402 = _responses(
         self.req, [200, 404, 402])