def __init__(self, *args, **kwargs):
    self._set_ref_object(Scraper, **kwargs)
    self.scraper = self.ref_object
    self._set_config(**kwargs)
    if self.scraper.checker_type == 'N':
        msg = "No checker defined for scraper!"
        log.msg(msg, log.ERROR)
        raise CloseSpider(msg)
    if self.scraper.get_detail_page_url_id_elems().count() != 1:
        msg = 'Checkers can only be used for scraped object classes defined with a single DETAIL_PAGE_URL type id field!'
        log.msg(msg, log.ERROR)
        raise CloseSpider(msg)
    if self.scraper.checker_type == '4':
        if not self.scraper.checker_ref_url:
            msg = "Please provide a reference url for your 404 checker (Command: %s)." % (self.command)
            log.msg(msg, log.ERROR)
            raise CloseSpider(msg)
    if self.scraper.checker_type == 'X':
        if not self.scraper.checker_x_path or not self.scraper.checker_ref_url:
            msg = "Please provide the necessary x_path fields for your 404_OR_X_PATH checker (Command: %s)." % (self.command)
            log.msg(msg, log.ERROR)
            raise CloseSpider(msg)
    self._set_request_kwargs()
    self._set_meta_splash_args()
    self.start_urls.append(self.scraper.checker_ref_url)
    dispatcher.connect(self.response_received, signal=signals.response_received)

def __init__(self, *args, **kwargs):
    super(dyersonline_spider, self).__init__(*args, **kwargs)
    dispatcher.connect(self.spider_opened, signals.spider_opened)
    dispatcher.connect(self.spider_closed, signals.spider_closed)
    # parse the csv file to get the product ids
    self.csv_writer = UnicodeWriter(open(os.path.join(Current_dir, 'skus_.csv'), 'wb'), dialect='excel')

def __init__(self, *args, **kwargs):
    self._set_ref_object(Scraper, **kwargs)
    self.scraper = self.ref_object
    self._set_config(**kwargs)
    if self.scraper.checker_set.count() == 0:
        msg = "No checkers defined for scraper!"
        log.msg(msg, log.ERROR)
        raise CloseSpider(msg)
    for checker in self.scraper.checker_set.all():
        if checker.checker_type == '4':
            if not checker.checker_ref_url:
                msg = "Please provide a reference url for your checker (%s) (Command: %s)." % (unicode(checker), self.command)
                log.msg(msg, log.ERROR)
                raise CloseSpider(msg)
        if checker.checker_type == 'X':
            if not checker.checker_x_path or not checker.checker_ref_url:
                msg = "Please provide the necessary x_path fields for your checker (%s) (Command: %s)." % (unicode(checker), self.command)
                log.msg(msg, log.ERROR)
                raise CloseSpider(msg)
    self._set_request_kwargs()
    self._set_meta_splash_args()
    dispatcher.connect(self.response_received, signal=signals.response_received)

def __init__(self):
    self.con = None
    self.cur = None
    self.count = 0
    self.exist_count = 0
    dispatcher.connect(self.spider_opened, signals.spider_opened)
    dispatcher.connect(self.spider_closed, signals.spider_closed)

def __init__(self):
    if os.path.exists(self.not_found):
        os.remove(self.not_found)
    dispatcher.connect(self.spider_closed, signals.spider_closed)
    self.cities_lst = parkopedia.settings.loadFileIntoLIst(self.overviews_urls)
    self.cities_lst = list(set(self.cities_lst))
    parkopedia.settings.remaining_cities = copy.deepcopy(self.cities_lst)

def __init__(self, spider_modules):
    self.spider_modules = spider_modules
    self._spiders = {}
    for name in self.spider_modules:
        for module in walk_modules(name):
            self._load_spiders(module)
    dispatcher.connect(self.close_spider, signals.spider_closed)

def __init__(self, s_url, *args, **kwargs):
    super(ArtSpider, self).__init__(*args, **kwargs)
    self.allowed_domains = ['findartinfo.com']
    dispatcher.connect(self.spider_closed, signals.spider_closed)
    self.start_urls = [s_url]

def __init__(self, urls_fname, export_path, web_repository_export_path):
    """
    :param urls_fname: file name (or path) of the file containing the list of 'seed' URLs - one per line
    :param export_path: the base name of the files we are going to export 'items' to (web graph/linking information)
    :param web_repository_export_path: the directory where we are going to save the GZipped content of the pages
    """
    print "Starting WebCrawlerSpider..."
    self.start_urls = self.get_start_urls(urls_fname)
    self.set_of_starting_urls = set(self.start_urls)
    # Add the starting URLs to the set of seen ones, so that we do not crawl them twice...
    self.urls_seen = set(self.start_urls)  # this set will be modified afterwards, so it is not a duplicate of self.set_of_starting_urls
    print "Loaded", len(self.start_urls), "starting urls"
    self.export_results_filename = export_path
    self.start_time = time()
    self.crawled_pages = 0
    self.web_pages_export_dir = web_repository_export_path
    self.last_stats_print_time = time()
    self.last_stats_print_n_crawled = 0
    # The 4 following variables are used to compute the average number of outlinks that we are following,
    # in order to understand why we crawl so many fewer pages than expected
    self.outlinks_followed = 0
    self.outlinks_other_domains_followed = 0
    self.outlinks_same_domains_followed = 0
    self.outlinks_average_divider = 0
    super(WebCrawlerSpider, self).__init__()
    dispatcher.connect(self.spider_closed, signals.spider_closed)

def main(): """Setups item signal and run the spider""" # set up signal to catch items scraped from scrapy import signals from scrapy.xlib.pydispatch import dispatcher def catch_item(sender, item, **kwargs): print "Got:", item dispatcher.connect(catch_item, signal=signals.item_passed) # shut off log from scrapy.conf import settings settings.overrides['LOG_ENABLED'] = False # set up crawler from scrapy.crawler import CrawlerProcess crawler = CrawlerProcess(settings) crawler.start() # schedule spider # start engine scrapy/twisted print "STARTING ENGINE" crawler.start() print "ENGINE STOPPED"
def __init__(self, basedir):
    if '://' in basedir:
        basedir = basedir.split('://', 1)[1]
    self.basedir = basedir
    self._mkdir(self.basedir)
    self.created_directories = defaultdict(set)
    dispatcher.connect(self.spider_closed, signals.spider_closed)

def __init__(self):
    from scrapy.core import signals
    from scrapy.xlib.pydispatch import dispatcher
    self.conn = None
    dispatcher.connect(self.initialize, signals.engine_started)
    dispatcher.connect(self.finalize, signals.engine_stopped)

def __init__(self, url=None):
    # print "here i am"
    if url:
        # retrieve with post method, put for create, get for read, delete for delete
        # unvisitedurls http://localhost:5000/unvisitedurls?start=0&offset=10&spider=6w
        unirest.timeout(180)
        req = unirest.post(url, headers={"Accept": "application/json"})
        self.start_urls = [data['url'] for data in req.body['data']]
        self.name = url[url.find('spider=') + 7:]
    self.visitedurldict = OrderedDict()
    self.datadict = OrderedDict()
    self.filedict = OrderedDict()
    self.deadurldict = OrderedDict()
    self.visitedurldict['urls'] = []
    self.datadict['datas'] = []
    self.filedict['files'] = []
    self.deadurldict['urls'] = []
    rules = (
        Rule(sle(allow=("http://book.douban.com/isbn/\d+$")), callback="parse", follow=True),
        Rule(sle(allow=("http://book.douban.com/subject/\d+$")), callback="parse", follow=True),
    )
    # def __del__(self) work
    dispatcher.connect(self.spider_closed, signals.spider_closed)

def create_crawler(spider):
    '''Set up the item-passed signal and return a crawler with the spider scheduled'''
    from scrapy import signals
    from scrapy.xlib.pydispatch import dispatcher

    def catch_item(sender, item, **kwargs):
        print "Got:", item

    dispatcher.connect(catch_item, signal=signals.item_passed)

    # shut off log
    from scrapy.conf import settings
    settings.overrides['LOG_ENABLED'] = False

    # set up crawler
    from scrapy.crawler import CrawlerProcess
    crawler = CrawlerProcess(settings)
    crawler.install()
    crawler.configure()

    # schedule spider
    crawler.crawl(spider)
    return crawler

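A minimal usage sketch for the create_crawler() helper above, following the same legacy Scrapy 0.x pattern used elsewhere in this listing; MySpider is a hypothetical spider class, not part of the original snippet.

# Sketch only: MySpider is a placeholder for whatever spider you pass in.
crawler = create_crawler(MySpider())
print "STARTING ENGINE"
crawler.start()  # blocks until the crawl finishes and the engine stops
print "ENGINE STOPPED"
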
def __init__(self, *args, **kwargs):
    super(DjangoChecker, self).__init__(*args, **kwargs)
    self._check_checker_config()
    self.start_urls.append(self.scrape_url)
    self.scheduler = Scheduler(self.scraper.scraped_obj_class.scraper_scheduler_conf)
    dispatcher.connect(self.response_received, signal=signals.response_received)

def main(): """Rutina principal para la ejecución del Spider""" # set up signal to catch items scraped from scrapy import signals from scrapy.xlib.pydispatch import dispatcher def catch_item(sender, item, **kwargs): print "Item Extraido:", item dispatcher.connect(catch_item, signal=signals.item_passed) from scrapy.conf import settings settings.overrides['LOG_ENABLED'] = False # setup crawler from scrapy.crawler import CrawlerProcess crawler = CrawlerProcess(settings) crawler.install() crawler.configure() # definir el spider para el crawler crawler.crawl(BloggerSpider()) # iniciar scrapy print "STARTING ENGINE" crawler.start() print "ENGINE STOPPED"
def __init__(self):
    dispatcher.connect(self.spider_closed, signals.spider_closed)
    if self.inputs_urls == "part_numbers.txt":
        parts_lst = list()
        parts = moxa.settings.loadFileIntoLIst(self.inputs_urls)
        # for part in parts:
        #     part_no = part.replace('-', '_')
        #     part_no = part_no.replace(' ', '_').strip() + '.htm'
        #     parts_lst.append(part_no)
        # parts_lst = [urljoin('http://www.moxa.com/product/', x) for x in parts_lst]
        # for p in parts_lst:
        #     print p
        self.inputs_urls_lst = parts  # copy.deepcopy(list(set(parts_lst)))
    elif self.inputs_urls == "part_numbers_and_urls.csv":
        parts_and_urls = moxa.settings.loadFileIntoLIst(self.inputs_urls)
        for p in parts_and_urls:
            purl = p.split(",")
            if len(purl) == 2:
                part_number = purl[0]
                part_url = purl[1]
                self.parts[part_number] = part_url
    else:
        self.inputs_urls_lst = moxa.settings.loadFileIntoLIst(self.inputs_urls)
    moxa.settings.remaining_urls = copy.deepcopy(self.inputs_urls_lst)

def crawl(spiders_classes, connector, debug=False, spider_error_callback=stdout_error_callback, scrapy_settings=None):
    """
    Launch a crawl job for the given JobSpider classes.

    :param scrapy_settings: dict of settings merged with CrawlerProcess default settings
    :param debug: (bool) activate or disable debug
    :param spider_error_callback: callback for spider errors (see http://doc.scrapy.org/en/latest/topics/signals.html#spider-error)
    :param connector: Connector instance
    :param spiders_classes: JobSpider class list
    :return: list of spider instances
    """
    if debug:
        dispatcher.connect(spider_error_callback, signals.spider_error)

    settings = {
        'ITEM_PIPELINES': {
            'pyjobs_crawlers.pipelines.RecordJobPipeline': 1,
        },
        'connector': connector,
        'LOG_ENABLED': False,
        'DOWNLOAD_DELAY': 1 if not debug else 0,
    }
    if scrapy_settings:
        settings.update(scrapy_settings)

    process = CrawlerProcess(settings)
    for spider_class in spiders_classes:
        process.crawl(spider_class, debug=debug)

    spiders = []
    for crawler in list(process.crawlers):
        spiders.append(crawler.spider)
    process.start()
    return spiders

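A hedged usage sketch for the crawl() helper above; MyJobSpider and MyConnector are hypothetical stand-ins for a concrete pyjobs_crawlers JobSpider subclass and Connector implementation, neither of which is defined in this snippet.

# Sketch only: both names below are placeholders, not real pyjobs_crawlers classes.
connector = MyConnector()
spiders = crawl([MyJobSpider], connector, debug=True)
# process.start() blocks inside crawl(), so the returned spiders have finished crawling
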
def __init__(self, settings):
    super(OffsiteMiddleware, self).__init__(settings)
    self.host_regexes = {}
    self.domains_seen = {}
    dispatcher.connect(self.spider_opened, signal=signals.spider_opened)
    dispatcher.connect(self.spider_closed, signal=signals.spider_closed)

def __init__(self):
    dispatcher.connect(self.spider_closed, signals.spider_closed)
    self.file = codecs.open('qiche.json', 'wb', encoding='utf-8')
    self.bucketname = 'scrapy_data_2'
    self.conn = S3Connection()
    self.getbucket = self.conn.get_bucket(self.bucketname)
    self.k = Key(self.getbucket)

def test_priorization(self):
    webdriver = Mock()
    settings = self.settings(WEBDRIVER_BROWSER=webdriver)
    webdriver.get.side_effect = self._wait
    webdriver.page_source = u''
    dispatcher.connect(self._stop_reactor, signal=signals.spider_closed)
    crawler = Crawler(Settings(values=settings))
    crawler.configure()
    spider = self.Spider(name='test', domain='testdomain')
    crawler.crawl(spider)
    crawler.start()
    log.start(loglevel='ERROR')
    reactor.run()
    assert webdriver.get.mock_calls == [
        call('http://testdomain/path?wr=0'),
        call('http://testdomain/path?wr=0&wa=0'),
        call('http://testdomain/path?wr=0&wa=1'),
        call('http://testdomain/path?wr=1'),
        call('http://testdomain/path?wr=1&wa=0'),
        call('http://testdomain/path?wr=1&wa=1'),
        call('http://testdomain/path?wr=0&wa=0&wr=0'),
        call('http://testdomain/path?wr=0&wa=1&wr=0'),
        call('http://testdomain/path?wr=1&wa=0&wr=0'),
        call('http://testdomain/path?wr=1&wa=1&wr=0')]

def __init__(self, settings, stats):
    """ Instantiate the queue and worker threads """
    self.queue = Queue()
    self.spider = ""
    self.lock = RLock()
    self.settings = settings
    self.stats = stats
    self.logger = logging.getLogger(self.__class__.__name__)
    dispatcher.connect(self.spider_closed, signals.spider_closed)

    # Instantiate threads
    for i in range(self.settings['THREADS']):
        t = Thread(target=self.worker)
        t.daemon = True
        t.start()

    # Wait for the queue...
    self.queue.join()

    # Check whether the download dir exists.
    if not os.path.exists(self.settings['DOWNLOAD_DIR']):
        self.logger.debug("Directory %s does not exist." % self.settings['DOWNLOAD_DIR'])
        os.mkdir(self.settings['DOWNLOAD_DIR'])
    else:
        self.logger.debug("Directory %s exists!" % self.settings['DOWNLOAD_DIR'])

def __init__(self, *args, **kwargs):
    super(VisegradSpider, self).__init__(*args, **kwargs)
    vpapi.parliament(self.get_parliament())
    vpapi.authorize(self.get_user(), self.get_password())
    dispatcher.connect(self.spider_opened, signals.spider_opened)

def __init__(self):
    self.crawler = CrawlerProcess(settings)
    if not hasattr(project, 'crawler'):
        self.crawler.install()
    self.crawler.configure()
    self.items = []
    dispatcher.connect(self._item_passed, signals.item_passed)

def __init__(self):
    for d in settings.ALLOWED_DOMAINS:
        self.filename += d
    self.filename += ".db"
    self.conn = None
    dispatcher.connect(self.initialize, signals.engine_started)
    dispatcher.connect(self.initialize, signals.engine_stopped)

def __init__(self, spider, query, results):
    Process.__init__(self)
    self.results = results
    self.items = []
    self.query = query
    self.spider = spider
    dispatcher.connect(self._item_passed, signals.item_passed)

def __init__(self):
    dispatcher.connect(self.finalize, scrapy.signals.engine_stopped)
    if not os.path.exists(self.res_folder):
        os.makedirs(self.res_folder)
    self.file_200 = open(self.res_folder + self.list_200, 'w')
    self.file_302 = open(self.res_folder + self.list_302, 'w')
    self.file_others = open(self.res_folder + self.list_others, 'w')

def test_send_catch_log(self):
    test_signal = object()
    handlers_called = set()

    def log_received(event):
        handlers_called.add(log_received)
        assert "error_handler" in event['message'][0]
        assert event['logLevel'] == log.ERROR

    txlog.addObserver(log_received)
    dispatcher.connect(self.error_handler, signal=test_signal)
    dispatcher.connect(self.ok_handler, signal=test_signal)
    result = yield defer.maybeDeferred(self._get_result, test_signal, arg='test',
                                       handlers_called=handlers_called)

    assert self.error_handler in handlers_called
    assert self.ok_handler in handlers_called
    assert log_received in handlers_called
    self.assertEqual(result[0][0], self.error_handler)
    self.assert_(isinstance(result[0][1], Failure))
    self.assertEqual(result[1], (self.ok_handler, "OK"))

    txlog.removeObserver(log_received)
    self.flushLoggedErrors()
    dispatcher.disconnect(self.error_handler, signal=test_signal)
    dispatcher.disconnect(self.ok_handler, signal=test_signal)

def handle(self, *args, **options):
    from scrapy import signals
    from scrapy.xlib.pydispatch import dispatcher

    def catch_item(sender, item, **kwargs):
        print "Got:", item

    dispatcher.connect(catch_item, signal=signals.item_passed)

    from scrapy.conf import settings
    settings.overrides['LOG_ENABLED'] = True

    from scrapy.crawler import CrawlerProcess
    crawler = CrawlerProcess(settings)
    crawler.install()
    crawler.configure()

    from alescspider.spiders import *
    spiders = [deputado_spider.DeputadoSpider()]
    # spiders = [presenca_spider.PresencaSpider(), votos_spider.VotosSpider(), deputado_spider.DeputadoSpider()]
    for spider in spiders:
        crawler.queue.append_spider(spider)

    print "STARTING ENGINE"
    crawler.start()
    print "ENGINE STOPPED"

def run(self):
    self.port = start_test_site()
    self.portno = self.port.getHost().port
    start_urls = [self.geturl("/"), self.geturl("/redirect")]
    self.spider = TestSpider(start_urls=start_urls)

    for name, signal in vars(signals).items():
        if not name.startswith('_'):
            dispatcher.connect(self.record_signal, signal)

    self.crawler = get_crawler()
    self.crawler.install()
    self.crawler.configure()
    self.crawler.signals.connect(self.item_scraped, signals.item_scraped)
    self.crawler.signals.connect(self.request_scheduled, signals.request_scheduled)
    self.crawler.signals.connect(self.response_downloaded, signals.response_downloaded)
    self.crawler.crawl(self.spider)
    self.crawler.start()

    self.deferred = defer.Deferred()
    dispatcher.connect(self.stop, signals.engine_stopped)
    return self.deferred

def run_spider(spider, settings):
    """Run a spider with given settings"""
    from scrapy import signals
    from scrapy.xlib.pydispatch import dispatcher
    from scrapy.settings import CrawlerSettings

    def catch_item(sender, item, **kwargs):
        # log.msg("Got: " + str(item))
        pass

    dispatcher.connect(catch_item, signal=signals.item_passed)

    # clean storage
    scraperwiki.sqlite.execute("drop table if exists " + spider.name)
    scraperwiki.sqlite.commit()

    from scrapy.crawler import CrawlerProcess
    settings = CrawlerSettings(values=settings)
    crawler = CrawlerProcess(settings)
    crawler.install()
    crawler.configure()
    crawler.crawl(spider)
    # log.start(loglevel='DEBUG')
    crawler.start()

def __init__(self):
    self.httpHash = {}
    self.socks5Hash = {}
    self.httpFile = open('http.txt', 'w')
    self.socks5File = open('socks5.txt', 'w')
    dispatcher.connect(self.finalize, scrapy.signals.engine_stopped)

def __init__(self):
    log.start()
    dispatcher.connect(self.spider_opened, signals.spider_opened)
    dispatcher.connect(self.spider_closed, signals.spider_closed)
    self.fjsons = {}

def __init__(self, *args, **kwargs):
    # turn off annoying logging, set LOG_LEVEL=DEBUG in settings.py to see more logs
    logger = logging.getLogger('scrapy.middleware')
    logger.setLevel(logging.WARNING)

    dispatcher.connect(self.save_cookie, signal=signals.spider_closed)
    super().__init__(*args, **kwargs)

    # parse date
    if 'date' not in kwargs:
        self.logger.info('Date attribute not provided, scraping date set to 2004-02-04 (fb launch date)')
        self.date = datetime.today() - timedelta(days=8)
        # self.date = datetime(2004, 2, 4)
    else:
        self.date = datetime.strptime(kwargs['date'], '%Y-%m-%d')
        self.logger.info('Date attribute provided, fbcrawl will stop crawling at {}'.format(kwargs['date']))
    self.year = self.date.year

    # parse start date
    if 'skipto_date' in kwargs:
        self.skipto_date = datetime.strptime(kwargs['skipto_date'], '%Y-%m-%d')
        self.logger.info('Skipto Date attribute provided, fbcrawl will start crawling at {}'.format(kwargs['skipto_date']))
    else:
        self.skipto_date = datetime.today() - timedelta(days=7)

    # parse lang, if not provided (but is supported) it will be guessed in parse_home
    if 'lang' not in kwargs:
        self.logger.info('Language attribute not provided, fbcrawl will try to guess it from the fb interface')
        self.logger.info('To specify, add the lang parameter: scrapy fb -a lang="LANGUAGE"')
        self.logger.info('Currently choices for "LANGUAGE" are: "en", "es", "fr", "it", "pt"')
        self.lang = '_'
    elif self.lang == 'en' or self.lang == 'es' or self.lang == 'fr' or self.lang == 'it' or self.lang == 'pt':
        self.logger.info('Language attribute recognized, using "{}" for the facebook interface'.format(self.lang))
    else:
        self.logger.info('Lang "{}" not currently supported'.format(self.lang))
        self.logger.info('Currently supported languages are: "en", "es", "fr", "it", "pt"')
        self.logger.info('Change your interface lang from facebook settings and try again')
        raise AttributeError('Language provided not currently supported')

    # max num of posts to crawl
    if 'max' not in kwargs:
        self.max = int(10e5)
    else:
        self.max = int(kwargs['max'])

    # current year, this variable is needed for proper parse_page recursion
    self.k = datetime.now().year

    self.url_root = 'https://mbasic.facebook.com/'
    self.start_urls = self.load_urllist(os.path.join(os.path.dirname(__file__), '../../fp_urls')) + \
        self.load_urllist(os.path.join(os.path.dirname(__file__), '../../group_urls'))
    self.cookie_path = os.path.join(os.path.dirname(__file__), '../../cookie.json')
    self.cookie = self.load_cookie()

def __init__(self):
    self.filename = self.name + ".txt"
    self.filename_tmp = self.name + "_tmp.txt"
    self.file_tmp = codecs.open(self.filename_tmp, 'a', 'utf-8')
    dispatcher.connect(self.finalize, scrapy.signals.engine_stopped)

def __init__(self, **kwargs):
    super().__init__(**kwargs)
    dispatcher.connect(self.quit, signals.spider_closed)

def __init__(self):
    # raw string so the backslashes in the Windows path are not treated as escapes
    self.driver = webdriver.Firefox(
        executable_path=r'C:\Code\Python\Selenium\geckodriver.exe')
    super(GamewebSpider, self).__init__()
    dispatcher.connect(self.spider_closed, signals.spider_closed)

def __init__(self):
    dispatcher.connect(self.spider_closed, signals.spider_closed)
    self.connection = sqlite3.connect(settings.DATABASE_NAME)
    self.cursor = self.connection.cursor()
    self.gigs_to_send = []

def __init__(self, **kwargs):
    self.fail_urls = []
    dispatcher.connect(self.handle_spider_closed, signals.spider_closed)

def __init__(self):
    options = webdriver.ChromeOptions()
    options.add_argument('headless')
    # pass the headless options to the driver
    self.browser = webdriver.Chrome(options=options)
    super(LagouSpider, self).__init__()
    dispatcher.connect(self.spider_closed, signals.spider_closed)

def __init__(self):
    super(guiZhouTransformNoticeSpider, self).__init__()
    dispatcher.connect(self.CloseSpider, signals.spider_closed)
    self.targetUrl = 'http://zrzy.guizhou.gov.cn/zfxxgk/zfxxgkml/zdlyxxgkml/tdcrzrgg/index_{}.html'
    self.header = {'User-Agent': random.choice(agent_list)}
    self.reStr = '()\w\.:: 。\(\)〔〕≤;,≥《》\-\/\%,、\.'

def __init__(self, *args, **kwargs):
    super(OakfurniturelandcoSpider, self).__init__(*args, **kwargs)
    dispatcher.connect(self.spider_idle, signals.spider_idle)
    self.parse_all = True

def __init__(self, *args, **kwargs):
    super(DfscoSpider, self).__init__(*args, **kwargs)
    dispatcher.connect(self.spider_idle, signals.spider_idle)

def __init__(self):
    super(B2bNewSpider, self).__init__()
    # bind initialize/finalize to the engine start/stop events
    dispatcher.connect(self.initial, signals.engine_started)
    dispatcher.connect(self.finalize, signals.engine_stopped)

def __init__(self):
    dispatcher.connect(self.spider_closed, signals.spider_closed)

def __init__(self):
    super(JsonWithEncodingPipeline, self).__init__()
    self.suffix = 'json'
    dispatcher.connect(self.spider_opened, signals.spider_opened)
    dispatcher.connect(self.spider_closed, signals.spider_closed)
    self.files = {}

def __init__(self, allowed_domains=None, model=None, *args, **kwargs):
    super(GoogleNewsSpider, self).__init__(*args, **kwargs)
    self.isStop = 3
    # bind signals: handle spider_idle (used to delay closing) and item_scraped
    dispatcher.connect(self.spider_closed, signals.spider_idle)
    dispatcher.connect(self.spider_item, signals.item_scraped)

def __init__(self):
    self.buffer = 20
    self.data = []
    self.counter = 0
    dispatcher.connect(self.spider_closed, signals.spider_closed)

def __init__(self):
    super(CsvExportPipeline, self).__init__()
    self.suffix = 'csv'
    dispatcher.connect(self.spider_opened, signals.spider_opened)
    dispatcher.connect(self.spider_closed, signals.spider_closed)
    self.files = {}

def __init__(self, filename=None):
    self.workbook = xlwt.Workbook()
    self.sheet = self.workbook.add_sheet('Sheet_1')
    self.driver = webdriver.Firefox()
    dispatcher.connect(self.spider_closed, signals.spider_closed)

def __init__(self):
    dispatcher.connect(self.CloseSpider, signals.spider_closed)
    self.targetUrl = 'http://jcjg.nr.gd.gov.cn:8088/GisqReport7.0/ReportServer?reportlet=other/mhgg/crgg.cpt'
    self.header = {'User-Agent': random.choice(agent_list)}
    self.origin_url = 'http://jcjg.nr.gd.gov.cn:8088/GisqReport7.0/ReportServer?_={}&__boxModel__=true&op=page_content&sessionID={}&pn=1'

    # ... tail of the parse callback (the enclosing method definition is not part of this excerpt)
    i = i + 1
    csk.writerow([
        item['pid'][0], item['rating'][0], item['userid'][0],
        item['date'][0], item['title'][0], item['rbody'][0],
        item['helpful'], item['totalreview'], item['reviewid']
    ])
    next_page = hxs.select('//a[@class="nav_bar_next_prev"]/@href').extract()
    for n in next_page:
        if n:
            yield Request(urlparse.urljoin("http://www.flipkart.com", n[1:]),
                          callback=self.par_ur)


def stop_reactor():
    reactor.stop()


dispatcher.connect(stop_reactor, signal=signals.spider_closed)
spider = MySpiderd()
crawler = Crawler(Settings())
crawler.configure()
crawler.crawl(spider)
crawler.start()
log.start(loglevel=logging.DEBUG)
log.msg('Running reactor...')
reactor.run()  # the script will block here until the spider is closed
log.msg('Reactor stopped.')

def __init__(self):
    dispatcher.connect(self.engine_started, signal=signals.engine_started)
    dispatcher.connect(self.spider_opened, signal=signals.spider_opened)
    dispatcher.connect(self.spider_closed, signal=signals.spider_closed)

def __init__(self, *args, **kwargs):
    BaseSpider.__init__(self, *args, **kwargs)
    dispatcher.connect(self.process_pending, signals.spider_idle)
    self.run_num = 1

def __init__(self):
    logger.debug('Starting to crawl escience job postings')
    self.url = 'https://escience.org.cn'
    self.items = []
    dispatcher.connect(self.spider_closed, signals.spider_closed)

def __init__(self, filename=None):
    with open(os.path.dirname(__file__) + '/../../link.txt', 'r') as f:
        self.start_urls = [f.read()]
    self.driver = webdriver.Chrome()
    dispatcher.connect(self.spider_closed, signals.spider_closed)

def __init__(self, *args, **kwargs):
    super(GuitarGuitarAndertons, self).__init__(*args, **kwargs)
    self._all_categories_parsed = False
    dispatcher.connect(self.spider_idle, signals.spider_idle)
    self.seen = set()

def __init__(self):
    super(laSaResultNoticeSpider, self).__init__()
    dispatcher.connect(self.CloseSpider, signals.spider_closed)
    self.targetUrl = 'http://ggzy.lasa.gov.cn/Article/SearchArticle'
    self.header = {'User-Agent': random.choice(agent_list)}
    self.reStr = '()\w\.:: 。 \(\)〔〕㎡≤;,≥《》\-\/\%,、\.﹪㎡'

def __init__(self):
    self.conn = None
    dispatcher.connect(self.initialize, signals.engine_started)
    dispatcher.connect(self.finalize, signals.engine_stopped)

def __init__(self):
    self.browser = webdriver.Chrome(
        executable_path="E:/SE/chromedriver.exe")
    super(JobbleSpider, self).__init__()
    dispatcher.connect(self.spider_closed, signals.spider_closed)

def __init__(self):
    dispatcher.connect(self.spider_opened, signals.spider_opened)
    dispatcher.connect(self.spider_closed, signals.spider_closed)
    self.links = []
    self.changeCounter = 0