def get_project_settings():
    # Designating the settings
    # ENVVAR = 'SCRAPY_SETTINGS_MODULE'
    # When you use Scrapy, you have to tell it which settings you are using.
    # You can do this by using the SCRAPY_SETTINGS_MODULE environment variable.
    #
    # The value of SCRAPY_SETTINGS_MODULE should be in Python path syntax,
    # e.g. myproject.settings. Note that the settings module should be on
    # the Python import search path.
    if ENVVAR not in os.environ:
        project = os.environ.get('SCRAPY_PROJECT', 'default')
        init_env(project)
    settings_module_path = os.environ.get(ENVVAR)
    if settings_module_path:
        settings_module = import_module(settings_module_path)
    else:
        settings_module = None
    settings = CrawlerSettings(settings_module)

    # XXX: remove this hack
    pickled_settings = os.environ.get("SCRAPY_PICKLED_SETTINGS_TO_OVERRIDE")
    settings.overrides = pickle.loads(pickled_settings) if pickled_settings else {}

    # XXX: deprecate and remove this functionality
    for k, v in os.environ.items():
        if k.startswith('SCRAPY_'):
            settings.overrides[k[7:]] = v

    return settings
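A minimal sketch of the mechanism described in the comments above: the settings module is designated through the SCRAPY_SETTINGS_MODULE environment variable before get_project_settings() is called. The module name myproject.settings is a hypothetical placeholder, and the CrawlerSettings API shown throughout these examples is the legacy pre-1.0 Scrapy interface.

import os

# Hypothetical example: point Scrapy at a project settings module using
# Python path syntax; the module must be on the Python import search path.
os.environ['SCRAPY_SETTINGS_MODULE'] = 'myproject.settings'

# get_project_settings() (defined above) then imports that module and wraps
# it in a legacy CrawlerSettings object.
settings = get_project_settings()
print settings.get('BOT_NAME')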
def _build_settings(settings=None):
    if settings is None:
        settings = CrawlerSettings()
    elif isinstance(settings, dict):
        values = settings
        settings = CrawlerSettings()
        settings.defaults.update(values)
    return settings
def __init__(self, start_url_loader, loger,
             setting_module_path='le_crawler.common.headline_album_settings',
             *kargs, **kwargs):
    __import__(setting_module_path)
    self.__settings = CrawlerSettings(settings_module=sys.modules[setting_module_path])
    self.loger = loger
    # {album_id, {}}
    self.album_ids = {}
    self.__init_regs()
    self.__extend_map_handler = kwargs['extend_map_handler'] if \
        kwargs.has_key('extend_map_handler') else None
    if kwargs.has_key('extract_setting'):
        self.__extend_map_handler = \
            ExtendMapHandler.get_instance(start_url_loader,
                                          kwargs['extract_setting'])
    else:
        self.__extend_map_handler = \
            ExtendMapHandler.get_instance(start_url_loader)
    self.__url_normalize = UrlNormalize.get_instance()
    self.album_infos = {}
    self.url_filter = UrlFilter().get_instance()
def setUp(self):
    """Initialize the test."""
    crawler = Crawler(CrawlerSettings(settings))
    crawler.configure()
    self.spider = ebird_spider.EBirdSpider('REG')
    self.spider.set_crawler(crawler)
    self.requests = self.spider.start_requests()
def create_root(config):
    from scrapy import log
    from scrapy.settings import CrawlerSettings
    from slyd.crawlerspec import (CrawlerSpecManager,
                                  create_crawler_spec_resource)
    from slyd.bot import create_bot_resource
    import slyd.settings
    from slyd.projects import ProjectsResource

    root = Resource()
    root.putChild("static", File(config['docroot']))

    crawler_settings = CrawlerSettings(settings_module=slyd.settings)
    spec_manager = CrawlerSpecManager(crawler_settings)

    # add project management at /projects
    projects = ProjectsResource(crawler_settings)
    root.putChild('projects', projects)

    # add crawler at /projects/PROJECT_ID/bot
    log.msg("Slybot specs loading from %s/[PROJECT]" % spec_manager.basedir,
            level=log.DEBUG)
    projects.putChild("bot", create_bot_resource(spec_manager))

    # add spec at /projects/PROJECT_ID/spec
    spec = create_crawler_spec_resource(spec_manager)
    projects.putChild("spec", spec)

    return root
def run(self):
    feconfig = self.configdata[const.FE_CONFIG]
    try:
        #=======================================================================
        # if the city uses the default config
        #=======================================================================
        city_config = eval(feconfig[self.city_name])
    except Exception:
        city_config = {}
    start_page = city_config.get(const.START_PAGE,
                                 feconfig[const.DEFAULT_START_PAGE])
    end_page = city_config.get(const.END_PAGE,
                               feconfig[const.DEFAULT_END_PAGE])
    values = {
        const.CONFIG_DATA: self.configdata,
        const.START_PAGE: int(start_page),
        const.END_PAGE: int(end_page),
    }
    settings = u'crawler.shc.fe.settings'
    module_import = __import__(settings, {}, {}, [''])
    settings = CrawlerSettings(module_import, values=values)
    execute(argv=["scrapy", "crawl", 'SHCSpider'], settings=settings)
def run_spider(spider, settings):
    """Run a spider with given settings"""
    from scrapy import signals
    from scrapy.xlib.pydispatch import dispatcher
    from scrapy.settings import CrawlerSettings

    def catch_item(sender, item, **kwargs):
        #log.msg("Got:" + str(item))
        pass

    dispatcher.connect(catch_item, signal=signals.item_passed)

    # clean storage
    scraperwiki.sqlite.execute("drop table if exists " + spider.name)
    scraperwiki.sqlite.commit()

    from scrapy.crawler import CrawlerProcess

    settings = CrawlerSettings(values=settings)
    crawler = CrawlerProcess(settings)
    crawler.install()
    crawler.configure()
    crawler.crawl(spider)
    #log.start(loglevel='DEBUG')
    crawler.start()
def setUp(self):
    """Initialize the test."""
    settings.LOG_LEVEL = 'DEBUG'
    crawler = Crawler(CrawlerSettings(settings))
    crawler.configure()
    self.spider = ebird_spider.EBirdSpider('REG')
    self.spider.set_crawler(crawler)
def valid_proxy():
    module_ = __import__('crawler.httpproxy.settings', {}, {}, [''])
    values = {
        u'RETRY_ENABLED': 0,
        u'DOWNLOAD_TIMEOUT': 2,
    }
    settings = CrawlerSettings(module_, values=values)
    execute(argv=["scrapy", "crawl", "BaiDuHomePageSpider"], settings=settings)
def main():
    """Set up the item signal and run the spider"""
    # set up signal to catch items scraped
    def catch_item(sender, item, **kwargs):
        try:
            print("""\
<item>
<title>#{number}: {title}</title>
<link>{link}</link>
<description>{description}</description>
<pubDate>{pubdate}</pubDate>
<guid>{audio_url}</guid>
<enclosure url="{audio_url}" length="0" type="audio/mpeg" />
</item>\
""".format(**item))
        except:
            print 'ERROR', item

    # shut off log
    settings = CrawlerSettings()
    settings.overrides['LOG_ENABLED'] = False

    # set up crawler
    crawler = Crawler(settings)
    crawler.signals.connect(catch_item, signal=signals.item_passed)
    crawler.signals.connect(reactor.stop, signal=signals.spider_closed)
    crawler.configure()

    # schedule spider
    crawler.crawl(TALSpider())

    # print header
    with open('header.xml') as f:
        print f.read()

    # start engine scrapy/twisted
    crawler.start()
    reactor.run()

    # print footer
    with open('footer.xml') as f:
        print f.read()
def setUp(self):
    """Initialize the test."""
    crawler = Crawler(CrawlerSettings(settings))
    crawler.configure()
    self.spider = ebird_spider.EBirdSpider('REG')
    self.spider.set_crawler(crawler)
    self.spider.start_requests()
    self.records = [{
        'checklistID': 'CL00001',
        'comName': 'Common Name',
        'countryCode': 'CC',
        'countryName': 'Country',
        'firstName': 'Name',
        'howMany': 1,
        'lastName': 'Surname',
        'lat': 45.000000,
        'lng': -45.000000,
        'locID': 'L0000001',
        'locName': 'Location 1',
        'locationPrivate': True,
        'obsDt': '2013-03-27 09:00',
        'obsID': 'OBS0000001',
        'obsReviewed': False,
        'obsValid': True,
        'presenceNoted': False,
        'sciName': 'Scientific Name',
        'subID': 'S0000001',
        'subnational1Code': 'SN-01',
        'subnational1Name': 'Region',
        'subnational2Code': 'SN-02',
        'subnational2Name': 'County',
    }, {
        'checklistID': 'CL00002',
        'comName': 'Common Name',
        'countryCode': 'CC',
        'countryName': 'Country',
        'firstName': 'Name',
        'howMany': 1,
        'lastName': 'Surname',
        'lat': 50.000000,
        'lng': -50.000000,
        'locID': 'L0000002',
        'locName': 'Location 2',
        'locationPrivate': True,
        'obsDt': '2013-03-27 10:00',
        'obsID': 'OBS0000002',
        'obsReviewed': False,
        'obsValid': True,
        'presenceNoted': False,
        'sciName': 'Scientific Name',
        'subID': 'S0000002',
        'subnational1Code': 'SN-01',
        'subnational1Name': 'Region',
        'subnational2Code': 'SN-02',
        'subnational2Name': 'County',
    }]
def fetch_proxy():
    module_ = __import__('crawler.httpproxy.settings', {}, {}, [''])
    values = {
        u'DOWNLOAD_DELAY': 0,
        u'DOWNLOAD_TIMEOUT': 1,
        u'RETRY_ENABLED': 0
    }
    settings = CrawlerSettings(module_, values=values)
    execute(argv=["scrapy", "crawl", "FiveOneNewHTTPProxySpider"], settings=settings)
def __init__(self, spider, results):
    Process.__init__(self)
    self.results = results
    settings_module = importlib.import_module('Extractors.HTMLScraper.settings')
    settings = CrawlerSettings(settings_module)
    self.crawlerProcess = CrawlerProcess(settings)
    self.items = []
    self.spider = spider
    dispatcher.connect(self._item_passed, signals.item_passed)
def get_project_settings():
    if ENVVAR not in os.environ:
        project = os.environ.get("SCRAPY_PROJECT", "default")
        init_env(project)
    settings_module_path = os.environ.get(ENVVAR, "scrapy_settings")
    try:
        settings_module = __import__(settings_module_path, {}, {}, [""])
    except ImportError:
        settings_module = None
    settings = CrawlerSettings(settings_module)

    # XXX: remove this hack
    pickled_settings = os.environ.get("SCRAPY_PICKLED_SETTINGS_TO_OVERRIDE")
    settings.overrides = pickle.loads(pickled_settings) if pickled_settings else {}

    # XXX: deprecate and remove this functionality
    for k, v in os.environ.items():
        if k.startswith("SCRAPY_"):
            settings.overrides[k[7:]] = v

    return settings
def test_skip_parsing_webpages(self):
    """Verify no web requests are made if include_html is False."""
    crawler = Crawler(CrawlerSettings(settings))
    crawler.configure()
    spider = ebird_spider.EBirdSpider('REG')
    spider.set_crawler(crawler)
    spider.start_requests()
    spider.include_html = False
    response = response_for_data(self.records)
    results = spider.parse_locations(response)
    self.assertEqual(0, sum(1 for _ in results))
def get_project_settings():
    if ENVVAR not in os.environ:
        project = os.environ.get('SCRAPY_PROJECT', 'default')
        init_env(project)
    settings_module_path = os.environ.get(ENVVAR)
    if settings_module_path:
        settings_module = import_module(settings_module_path)
    else:
        settings_module = None
    settings = CrawlerSettings(settings_module)

    # XXX: remove this hack
    pickled_settings = os.environ.get("SCRAPY_PICKLED_SETTINGS_TO_OVERRIDE")
    settings.overrides = pickle.loads(pickled_settings) if pickled_settings else {}

    # XXX: deprecate and remove this functionality
    for k, v in os.environ.items():
        if k.startswith('SCRAPY_'):
            settings.overrides[k[7:]] = v

    return settings
def get_crawler(settings_dict=None):
    """Return an unconfigured Crawler object. If settings_dict is given, it
    will be used as the settings present in the settings module of the
    CrawlerSettings.
    """
    class SettingsModuleMock(object):
        pass

    settings_module = SettingsModuleMock()
    if settings_dict:
        for k, v in settings_dict.items():
            setattr(settings_module, k, v)
    settings = CrawlerSettings(settings_module)
    return Crawler(settings)
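A hedged usage sketch for the helper above (the setting names and values are illustrative only): each key in settings_dict becomes an attribute of the mocked settings module, so CrawlerSettings exposes it as a module-level setting on the returned Crawler.

# Hypothetical usage of get_crawler() defined above.
crawler = get_crawler({'USER_AGENT': 'test-agent', 'DOWNLOAD_DELAY': 2})
assert crawler.settings.get('USER_AGENT') == 'test-agent'
assert crawler.settings.getint('DOWNLOAD_DELAY') == 2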
def get_project_settings():
    if ENVVAR not in os.environ:
        project = os.environ.get('SCRAPY_PROJECT', 'default')
        init_env(project)
    settings_module_path = os.environ.get(ENVVAR, 'scrapy_settings')
    try:
        settings_module = __import__(settings_module_path, {}, {}, [''])
    except ImportError:
        settings_module = None
    settings = CrawlerSettings(settings_module)

    # XXX: remove this hack
    pickled_settings = os.environ.get("SCRAPY_PICKLED_SETTINGS_TO_OVERRIDE")
    settings.overrides = pickle.loads(pickled_settings) if pickled_settings else {}

    # XXX: deprecate and remove this functionality
    for k, v in os.environ.items():
        if k.startswith('SCRAPY_'):
            settings.overrides[k[7:]] = v

    return settings
def crawl_spider(domain, day1, day2):
    spider_dict = {'agoda.com': AgodaSpider, 'ivivu.com': IvivuSpider}
    args = {'from_date': datetime.now() + timedelta(days=day1),
            'to_date': datetime.now() + timedelta(days=day2)}
    print "\n crawl spider==========="
    spider = spider_dict.get(domain, AgodaSpider)
    spider = spider(args)
    settings_module = import_module('scraper.scraper.settings')
    settings = CrawlerSettings(settings_module)
    settings.overrides['SPIDER_MODULES'] = ['scraper.scraper.spiders']
    # settings = get_project_settings()
    crawler = Crawler(settings)
    crawler.signals.connect(reactor.stop, signal=signals.spider_closed)
    crawler.configure()
    crawler.crawl(spider)
    crawler.start()
    log.start()
    reactor.run()
def _runCrawler(spider, results):
    settings_module = importlib.import_module('Extractors.HTMLScraper.settings')
    settings = CrawlerSettings(settings_module)
    crawlerProcess = CrawlerProcess(settings)
    items = []

    def _item_passed(item, response, spider):
        items.append(item)

    dispatcher.connect(_item_passed, signals.item_scraped)
    crawler = crawlerProcess.create_crawler("currentCrawler")
    crawler.crawl(spider)
    crawlerProcess.start()
    crawlerProcess.stop()
    results.put(items)
def test(self):
    crawler = mock.MagicMock()
    crawler.settings = CrawlerSettings()
    crawler.settings.overrides['USER_AGENT'] = 'CustomAgent'
    self.assertRaises(NotConfigured, RobotsTxtMiddleware, crawler)
    crawler.settings.overrides['ROBOTSTXT_OBEY'] = True
    crawler.engine.download = mock.MagicMock()
    ROBOTS = re.sub(r'^\s+(?m)', '', '''
    User-Agent: *
    Disallow: /admin/
    Disallow: /static/
    ''')
    response = Response('http://site.local/robots.txt', body=ROBOTS)

    def return_response(request, spider):
        deferred = Deferred()
        reactor.callFromThread(deferred.callback, response)
        return deferred

    crawler.engine.download.side_effect = return_response
    middleware = RobotsTxtMiddleware(crawler)
    spider = None  # not actually used

    # There is a bit of neglect in robotstxt.py: robots.txt is fetched asynchronously,
    # and it is actually fetched only *after* the first process_request completes.
    # So, the first process_request will always succeed.
    # We defer test() because otherwise the robots.txt download mock will be
    # called after the assertRaises failure.
    self.assertIsNone(
        middleware.process_request(Request('http://site.local'), spider))  # not affected by robots.txt

    def test(r):
        self.assertIsNone(
            middleware.process_request(
                Request('http://site.local/allowed'), spider))
        self.assertRaises(IgnoreRequest, middleware.process_request,
                          Request('http://site.local/admin/main'), spider)
        self.assertRaises(IgnoreRequest, middleware.process_request,
                          Request('http://site.local/static/'), spider)

    deferred = Deferred()
    deferred.addCallback(test)
    reactor.callFromThread(deferred.callback, None)
    return deferred
def run(self):
    if self.proxies:
        values = configdata.get(const.vpsettings, {})
        values[AppConst.proxies] = self.proxies
        values[const.DOWNLOAD_TIMEOUT] = int(
            values.get(const.DOWNLOAD_TIMEOUT, 5))
        if const.Console in values:
            if values[const.Console] == u'1':
                # out to console
                values[const.LOG_FILE] = None
            else:
                log_dir = values.get(const.LOG_DIR, os.getcwd())
                if const.LOG_FILE in values:
                    logfile_prefix = datetime.datetime.now().strftime(
                        "%Y%m%d_%H%M%S_%f")
                    log_file = '%s_%s' % (logfile_prefix, values[const.LOG_FILE])
                    values[const.LOG_FILE] = os.sep.join([log_dir, log_file])
        settings = CrawlerSettings(None, values=values)
        execute(argv=["scrapy", "crawl", 'SOSOSpider'], settings=settings)
def run_spider(spider, settings):
    """Run a spider with given settings"""
    from scrapy import signals
    from scrapy.xlib.pydispatch import dispatcher
    from scrapy.settings import CrawlerSettings

    def catch_item(sender, item, **kwargs):
        #log.msg("Got:" + str(item))
        pass

    dispatcher.connect(catch_item, signal=signals.item_passed)

    from scrapy.crawler import CrawlerProcess

    settings = CrawlerSettings(values=settings)
    crawler = CrawlerProcess(settings)
    crawler.crawl(spider)
    #log.start(loglevel='DEBUG')
    crawler.start()
import json
import shutil
import sys
import tempfile

import nose

from scrapy.settings import CrawlerSettings

from checklists_scrapers import settings
from checklists_scrapers.spiders.worldbirds_spider import WorldBirdsSpider
from checklists_scrapers.tests.utils import RunCrawler
from checklists_scrapers.utils import list_files
from checklists_scrapers.tests.validation import checklists

settings.DOWNLOAD_DIR = tempfile.mkdtemp()
settings.REPORT_RECIPIENTS = ''

username = sys.argv[1]
password = sys.argv[2]
country = sys.argv[3]

spider = WorldBirdsSpider(username=username, password=password, country=country)
RunCrawler(CrawlerSettings(settings)).crawl(spider)

for path in list_files(settings.DOWNLOAD_DIR, 'json'):
    with open(path, 'rb') as fp:
        checklists.append(json.load(fp))

nose.run(argv=['checklists_scrapers.tests.validation'])

shutil.rmtree(settings.DOWNLOAD_DIR)
def test_spec_manager():
    """Create a CrawlerSpecManager configured to use test settings"""
    crawler_settings = CrawlerSettings(settings_module=test_settings)
    return SpecManager(crawler_settings)
def __contains__(self, item):
    #print(type(item))
    return True if CrawlerSettings.__getitem__(self, item) else False
class HeadLineAlbumExtractor(object):
    _instance = None
    _instance_lock = threading.Lock()

    @staticmethod
    def get_instance(start_url_loader,
                     setting_module_path='le_crawler.common.headline_album_settings',
                     *kargs, **kwargs):
        HeadLineAlbumExtractor._instance_lock.acquire()
        if not HeadLineAlbumExtractor._instance:
            loger = Log('album_crawler', '../log/album_crawler.log')
            HeadLineAlbumExtractor._instance = \
                HeadLineAlbumExtractor(start_url_loader, loger,
                                       setting_module_path, *kargs, **kwargs)
        HeadLineAlbumExtractor._instance_lock.release()
        return HeadLineAlbumExtractor._instance

    def __init__(self, start_url_loader, loger,
                 setting_module_path='le_crawler.common.headline_album_settings',
                 *kargs, **kwargs):
        __import__(setting_module_path)
        self.__settings = CrawlerSettings(settings_module=sys.modules[setting_module_path])
        self.loger = loger
        # {album_id, {}}
        self.album_ids = {}
        self.__init_regs()
        self.__extend_map_handler = kwargs['extend_map_handler'] if \
            kwargs.has_key('extend_map_handler') else None
        if kwargs.has_key('extract_setting'):
            self.__extend_map_handler = \
                ExtendMapHandler.get_instance(start_url_loader,
                                              kwargs['extract_setting'])
        else:
            self.__extend_map_handler = \
                ExtendMapHandler.get_instance(start_url_loader)
        self.__url_normalize = UrlNormalize.get_instance()
        self.album_infos = {}
        self.url_filter = UrlFilter().get_instance()

    def get_category_id(self, refer_url):
        ca = self.__get_category_name(refer_url)
        if 'joke' == ca:
            return 109
        elif 'ent' == ca:
            return 104
        else:
            return -1

    def __get_category_name(self, refer_url):
        return self.__extend_map_handler.settings.get_category_name(refer_url) \
            if self.__extend_map_handler else 'UNKONWN_CATEGORY'

    def __init_regs(self):
        # load glob id url reg
        self.local_id_reg = []
        for r in self.__settings.getlist('LOCAL_ID_REG', []):
            self.local_id_reg.append(re.compile(r, re.I | re.S))
        # load album id url
        self.album_id_match_regs = {}
        for k, v in self.__settings.getdict('ALBUM_ID_URL', {}).items():
            for r in v:
                self.album_id_match_regs.setdefault(k, []).append(re.compile(r, re.I | re.S))
        # load extend url dict
        self.extend_album_pages = self.__settings.getdict('ALBUM_PAGE_URL', {})
        # load global album id reg
        self.global_albumid_reg = self.__settings.getdict('GLOBAL_ALBUMID_REG', {})
        # href tags
        self.href_tags = self.__settings.getlist('HREF_TAGs', [])
        self.loger.log.info('load href url tags: %s' % len(self.href_tags))

    def __get_global_albumid(self, localid, site_album_id):
        url = self.global_albumid_reg[localid].replace('(*albumid*)', site_album_id)
        from le_crawler.core.docid_generator import gen_docid
        return gen_docid(url)

    def __get_localid(self, url):
        return query_domain_from_url(url) or self.url_filter.get_domain_from_url(url)

    # deprecated: using above
    #def __get_localid(self, url):
    #    for r in self.local_id_reg:
    #        sg = r.search(url)
    #        if not sg:
    #            continue
    #        g = sg.groups()
    #        if g:
    #            return g[0]
    #    return None

    def __parser_urls(self, sels):
        returls = []
        for urls in sels.xpath('//a'):
            for attr in self.href_tags:
                u = urls.xpath('./@%s' % attr)
                if u:
                    returls.append(u.extract()[0].encode('utf8'))
        return returls

    # return [idlist]
    def __get_site_album_id(self, localid, urls):
        retlist = set()
        for u in urls:
            for r in self.album_id_match_regs[localid]:
                sg = r.search(u)
                if sg:
                    g = sg.groups()
                    if g:
                        retlist.add(g[0])
                    break
        return list(retlist)

    # return album video urls
    def __get_album_pags(self, localid, idlist, refer_url):
        returls = []
        category = self.__get_category_name(refer_url)
        postfix = ' %s|channel' % category
        if self.extend_album_pages.has_key(localid):
            for pageurl in self.extend_album_pages[localid]:
                for id in idlist:
                    glid = self.__get_global_albumid(localid, id)
                    strtmp = pageurl.replace('(*albumid*)', id).replace('(*pagenum*)', '(*)') + postfix
                    from le_crawler.base.url_extend import extend_url
                    sta, extedurls = extend_url(strtmp, '1', '8', 0)
                    if not sta:
                        continue
                    for eu in extedurls:
                        # preprocess url
                        self.album_ids[eu.split(' ')[0]] = glid
                    returls.extend(extedurls)
        return returls

    def get_global_albumid_by_refer(self, refer_url):
        if self.album_ids.has_key(refer_url):
            return self.album_ids[refer_url]
        else:
            print 'Error: can not found global id:', refer_url

    # return description
    # dict = {'enter_page': [], 'album_pages': [], 'album_infos_pages': []}
    def parser_enter(self, url, pages):
        localid = self.__get_localid(url)
        if not localid:
            return []
        from scrapy.selector import Selector
        sel = Selector(text=pages, type='html')
        if not sel:
            return []
        urls = self.__parser_urls(sel)
        albumids = self.__get_site_album_id(localid, urls)
        albumurls = self.__get_album_pags(localid, albumids, url)
        # hock start urls to extend_map_handler
        self.__extend_map_handler.settings.add_start_urls(albumurls)
        return albumurls

    # input enter page
    # return album info pages
    def parser_album_info_pages(self, body, url, refer_url):
        sta = False
        albumid = self.get_global_albumid_by_refer(url)
        if not self.album_infos.has_key(albumid):
            sta, items = self.__extend_map_handler.settings.extract_custom_map(
                body=body, pageurl=url)
            if not sta:
                return []
            cateid = self.get_category_id(refer_url)
            self.album_infos[albumid] = items
            self.album_infos[albumid]['album_cid'] = cateid
            self.album_infos[albumid]['album_id'] = albumid
            self.album_infos[albumid]['album_url'] = \
                self.__url_normalize.get_unique_url(url)
        # second extract urls
        status, extend_url = self.__extend_map_handler.extract_extend_map(
            body=body, pageurl=url, ignore_empty_property=True)
        if status:
            ldict = self.__extend_map_handler.get_inlink_location_dict()
            if not ldict.has_key(extend_url[0]):
                self.loger.log.error('Failed found inlink location for %s' % extend_url[0])
                assert False, 'Failed found inlink location, %s' % extend_url[0]
            else:
                locationstr = ldict[extend_url[0]]
                self.album_infos[albumid].setdefault('album_vids', {})[locationstr] = \
                    [self._get_store_key(i) for i in extend_url]
        video_url = extend_url[0] if extend_url else None
        album_pic = self.__extend_map_handler.lookup_extend_map(video_url, type='dict')['cover'] \
            if video_url and \
            self.__extend_map_handler.lookup_extend_map(video_url, type='dict') \
            and self.__extend_map_handler.lookup_extend_map(video_url, type='dict').has_key('cover') else None
        if album_pic:
            self.album_infos[albumid]['album_pic'] = album_pic
        return extend_url

    # Note: this docid should be the same as today_tv_writer
    def _get_store_key(self, url):
        return md5.new(url).hexdigest()

    def parser_enter_page(self, url, sels):
        glbid = self.get_global_albumid_by_refer(url)
        if not glbid:
            return

    def parser_ablum_pages(self, sels):
        pass

    def ignore_crawl_link(self, url):
        return self.__extend_map_handler.settings.ignore_link_to_crawler(url)

    def get_album_info(self, albumid):
        return self.album_infos[albumid] \
            if self.album_infos.has_key(albumid) else {}

    def get_album_infos(self):
        return self.album_infos

    def debug_album_infos(self):
        for k, v in self.album_infos.items():
            print '-------', 'albuminfo', '-------'
            for k1, v1 in v.items():
                print k1, v1
# encoding=UTF-8
'''
Created on 2013-4-20

@author: Administrator
'''
from robot.configutil import ConfigFile
from scrapy.cmdline import execute
from scrapy.settings import CrawlerSettings

if __name__ == '__main__':
    cfg_file = r'fetchqkyj.cfg'
    configdata = ConfigFile.readconfig(cfg_file).data
    import_modules = __import__('fost.zxy.qkyj.settings', globals={}, locals={},
                                fromlist=['', ])
    values = dict(configdata)
    settings = CrawlerSettings(import_modules, values=values)
    execute(['scrapy', 'crawl', 'ZXY_QKYJ_Spider', ], settings=settings)
def test_projects_resource(temp_projects_dir):
    """Create a ProjectsResource configured to use test settings"""
    crawler_settings = CrawlerSettings(settings_module=test_settings)
    projects = ProjectsResource(crawler_settings)
    projects.projectsdir = temp_projects_dir
    return projects
from twisted.internet import reactor
from scrapy.crawler import Crawler
from scrapy.settings import CrawlerSettings
from scrapy import log, signals
from scrapy.xlib.pydispatch import dispatcher
from finance.spiders.financeSpider import FinanceSpider
import sys


def stop_reactor():
    reactor.stop()

dispatcher.connect(stop_reactor, signal=signals.spider_closed)
spider = FinanceSpider(symbol=sys.argv[1])
crawler = Crawler(CrawlerSettings())
crawler.configure()
crawler.crawl(spider)
crawler.start()
log.start(loglevel=log.ERROR)
log.msg('Running reactor...')
reactor.run()  # the script will block here
log.msg('Reactor stopped.')
print spider.risultato
def setUp(self):
    """Initialize the test."""
    crawler = Crawler(CrawlerSettings(settings))
    crawler.configure()
    self.spider = ebird_spider.EBirdSpider('REG')
    self.spider.set_crawler(crawler)
    self.lista = {
        'meta': {
            'version': 1,
            'language': 'en',
        },
        'identifier': 'S0000001',
        'date': '2013-03-27',
        'protocol': {
            'time': '09:00',
        },
        'observers': {
            'names': ['Name Surname'],
            'count': 1,
        },
        'source': {
            'name': 'eBird',
            'submitted_by': 'Name Surname',
        },
        'location': {
            'identifier': 'L0000001',
            'name': 'Location 1',
            'county': 'County',
            'region': 'Region',
            'country': 'Country',
            'lat': 45.0,
            'lon': -45.0,
        },
        'entries': [{
            'identifier': 'OBS0000001',
            'species': {
                'name': 'Common Name',
            },
            'count': 23
        }]
    }
    self.listb = {
        'meta': {
            'version': 1,
            'language': 'en',
        },
        'source': 'ebird',
        'url': 'http://ebird.org/',
        'observers': {
            'names': ['Other Name'],
            'count': 1,
        },
        'activity': 'Birding',
        'protocol': {
            'name': 'Traveling',
            'duration_hours': 2,
            'duration_minutes': 35,
            'distance': 2000,
            'area': 0,
        },
        'comment': 'A comment',
        'entries': [{
            'species': {
                'name': 'Common Name',
            },
            'count': 23,
            'details': [{
                'age': 'AD',
                'sex': 'M',
                'count': 9
            }, {
                'age': 'AD',
                'sex': 'F',
                'count': 6
            }, {
                'age': 'JUV',
                'sex': 'X',
                'count': 8
            }]
        }]
    }
    self.fixture = self.spider.merge_checklists(self.lista, self.listb)
def __init__(self, module_path='le_crawler.base.url_normalize_settings'):
    module_path = module_path
    __import__(module_path)
    self.__settings = CrawlerSettings(settings_module=sys.modules[module_path])
    self.__load_settings()
class UrlNormalize():
    #def init_onece(self, *args, **kwargs):
    def __init__(self, module_path='le_crawler.base.url_normalize_settings'):
        module_path = module_path
        __import__(module_path)
        self.__settings = CrawlerSettings(settings_module=sys.modules[module_path])
        self.__load_settings()

    def __convert_map_regs(self, regmap):
        if not regmap:
            return {}
        tmpres = {}
        for (id, reglist) in regmap.items():
            regstmp = []
            for r in reglist:
                #print 're....%s' % r
                regstmp.append(re.compile(r, re.IGNORECASE))
            if regstmp:
                tmpres[id] = regstmp
        return tmpres

    def __accept_reg(self, id_reglist, item):
        if not item:
            return None
        for (id, tmpres) in id_reglist.items():
            for r in tmpres:
                if r.search(item):
                    return id
        return None

    def __get_keep_para_list(self, mapdict, id):
        if mapdict.has_key(id):
            return mapdict[id]
        return []

    def __get_keep_query_lst(self, id):
        return self.__get_keep_para_list(self.__keep_query, id)

    def __keep_fragment(self, id):
        if not self.__keep_fragments or not self.__keep_fragments.has_key(id):
            return False
        return self.__keep_fragments[id]

    def __update_paras_with_extra(self, input_dict, id):
        if not self.__extra_para.has_key(id):
            return input_dict
        input_dict.update(self.__extra_para[id])

    def get_mapping_id(self, url=None, domain=None):
        # first mapping domain
        if self.__id_mapping_domain.has_key(domain):
            return self.__id_mapping_domain(domain)
        # second try match reg
        return self.__accept_reg(self.__id_mapping_reg, url)

    def __set_query_dict(self, org_dict, id):
        if org_dict is None or id is None:
            return {}
        domain_k_p = self.__get_keep_query_lst(id)
        retdict = {}
        for (k, ef) in domain_k_p:
            if org_dict.has_key(k) and org_dict[k] != '':
                retdict[k] = org_dict[k]
            elif ef:
                retdict[k] = ''
        return retdict

    def __join_query(self, inputd):
        if not inputd:
            return ''
        query_str = None
        reslist = sorted(inputd.items(), key=lambda d: d[0], reverse=True)
        for (k, v) in reslist:
            if query_str:
                query_str += '&%s=%s' % (k, v[0])
            query_str = '%s=%s' % (k, v[0])
        return query_str

    def __load_settings(self):
        self.__id_mapping_reg = self.__convert_map_regs(self.__settings.getdict('ID_MAPPING_REG', {}))
        self.__id_mapping_domain = self.__convert_map_regs(self.__settings.getdict('ID_MAPPING_DOMAIN', {}))
        self.__keep_query = self.__settings.getdict('KEEP_QUERY', {})
        self.__keep_fragments = self.__settings.getdict('KEEP_FRAGEMENT', {})
        self.__extra_para = self.__settings.getdict('ADD_EXTRA_PARA', {})

    def get_unique_url(self, url, scheme=None, netloc=None, domain=None,
                       no_conf_no_oper=False):
        id = self.get_mapping_id(url=url, domain=domain)
        if id is None:
            if not no_conf_no_oper:
                id = 'DEFAULT'
            else:
                return url
        if id is None or url is None:
            raise Exception('Failed get mapping id for: %s, %s' % (domain, url))
        urlp = urlparse.urlsplit(url.strip(), allow_fragments=self.__keep_fragment(id))
        if not urlp:
            raise Exception('Failed convert urlparse %s' % url)
        nscheme = urlp.scheme or scheme
        nnetloc = urlp.netloc or netloc
        qdict = urlparse.parse_qs(urlp.query)
        fqdict = self.__set_query_dict(qdict, id)
        self.__update_paras_with_extra(fqdict, id)
        nquery = self.__join_query(fqdict)
        return urlparse.urlunsplit((nscheme, nnetloc, urlp.path, nquery, urlp.fragment)).strip()
def __contains__(self, item):
    return bool(CrawlerSettings.__getitem__(self, item))
def _get_settings(settings_dict=None):
    settings_module = type('SettingsModuleMock', (object,), settings_dict or {})
    return CrawlerSettings(settings_module)
def fetch51freeproxy():
    values = configdata.get(const.vpsettings, {})
    settings = CrawlerSettings(values=values)
    execute(argv=["scrapy", "crawl", "FOSpider"], settings=settings)