def __init__(self, *args, **kwargs):
        self._set_ref_object(Scraper, **kwargs)
        self.scraper = self.ref_object
        self._set_config(**kwargs)
        
        if self.scraper.checker_type == 'N':
            msg = "No checker defined for scraper!"
            log.msg(msg, log.ERROR)
            raise CloseSpider(msg)
        
        if self.scraper.get_detail_page_url_id_elems().count() != 1:
            msg = 'Checkers can only be used for scraped object classes defined with a single DETAIL_PAGE_URL type id field!'
            log.msg(msg, log.ERROR)
            raise CloseSpider(msg)
        
        if self.scraper.checker_type == '4':
            if not self.scraper.checker_ref_url:
                msg = "Please provide a reference url for your 404 checker (Command: %s)." % (self.command)
                log.msg(msg, log.ERROR)
                raise CloseSpider(msg)
        
        if self.scraper.checker_type == 'X':
            if not self.scraper.checker_x_path or not self.scraper.checker_ref_url:
                msg = "Please provide the necessary x_path fields for your 404_OR_X_PATH checker (Command: %s)." % (self.command)
                log.msg(msg, log.ERROR)
                raise CloseSpider(msg)

        self._set_request_kwargs()
        self._set_meta_splash_args()
        
        self.start_urls.append(self.scraper.checker_ref_url)
        dispatcher.connect(self.response_received, signal=signals.response_received)
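For reference, a minimal sketch of the response_received handler wired up on the last line; the argument names follow Scrapy's response_received signal, and the body is only a placeholder, not django-dynamic-scraper's actual checker logic.

    def response_received(self, response=None, request=None, spider=None, **kwargs):
        # Placeholder: a real checker would compare the response against the
        # configured checker_ref_url / checker_x_path before deleting anything.
        log.msg("Checker received %s (status %s)" % (response.url, response.status), log.DEBUG)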
Example #2
    def __init__(self, *args, **kwargs):
        super(dyersonline_spider, self).__init__(*args, **kwargs)
        dispatcher.connect(self.spider_opened, signals.spider_opened)
        dispatcher.connect(self.spider_closed, signals.spider_closed)        

        # parse the csv file to get the product ids
        self.csv_writer = UnicodeWriter(open(os.path.join(Current_dir,'skus_.csv'), 'wb'),dialect='excel')
Example #3
    def __init__(self, *args, **kwargs):
        self._set_ref_object(Scraper, **kwargs)
        self.scraper = self.ref_object
        self._set_config(**kwargs)
        
        if self.scraper.checker_set.count() == 0:
            msg = "No checkers defined for scraper!"
            log.msg(msg, log.ERROR)
            raise CloseSpider(msg)
        
        for checker in self.scraper.checker_set.all():
            if checker.checker_type == '4':
                if not checker.checker_ref_url:
                    msg = "Please provide a reference url for your checker (%s) (Command: %s)." % (unicode(checker), self.command)
                    log.msg(msg, log.ERROR)
                    raise CloseSpider(msg)
            
            if checker.checker_type == 'X':
                if not checker.checker_x_path or not checker.checker_ref_url:
                    msg = "Please provide the necessary x_path fields for your checker (%s) (Command: %s)." % (unicode(checker), self.command)
                    log.msg(msg, log.ERROR)
                    raise CloseSpider(msg)

        self._set_request_kwargs()
        self._set_meta_splash_args()
        
        dispatcher.connect(self.response_received, signal=signals.response_received)
Example #4
 def __init__(self):
     self.con = None
     self.cur = None
     self.count = 0
     self.exist_count = 0
     dispatcher.connect(self.spider_opened, signals.spider_opened)
     dispatcher.connect(self.spider_closed, signals.spider_closed)
Example #5
 def __init__(self):
     if os.path.exists(self.not_found):
         os.remove(self.not_found)
     dispatcher.connect(self.spider_closed, signals.spider_closed)
     self.cities_lst = parkopedia.settings.loadFileIntoLIst(self.overviews_urls)
     self.cities_lst = list(set(self.cities_lst))
     parkopedia.settings.remaining_cities = copy.deepcopy(self.cities_lst)
Example #6
 def __init__(self, spider_modules):
     self.spider_modules = spider_modules
     self._spiders = {}
     for name in self.spider_modules:
         for module in walk_modules(name):
             self._load_spiders(module)
     dispatcher.connect(self.close_spider, signals.spider_closed)
Example #7
	def __init__(self,s_url,*args,**kwargs):
		super(ArtSpider, self).__init__(*args, **kwargs)
		self.allowed_domains=['findartinfo.com']
		dispatcher.connect(self.spider_closed, signals.spider_closed)
		
		
		self.start_urls=[s_url]
Example #8
 def __init__(self, urls_fname, export_path, web_repository_export_path):
     """
         :param urls_fname: file name (or path) of the file containing the list of 'seed' URLs - one per line
         :param export_path: the base name of the files we are going to export 'items' to (web graph/linking information)
         :param web_repository_export_path: the directory where we are going to save the GZipped content of the pages
     """
     print "Starting WebCrawlerSpider..."
     self.start_urls = self.get_start_urls(urls_fname)
     self.set_of_starting_urls = set(self.start_urls)
     # Add the starting URLs to the set of seen ones, so that we do not crawl them twice...
     self.urls_seen = set(
         self.start_urls
     )  # this set will be modified afterwards, so it is not a duplicate of self.set_of_starting_urls
     print "Loaded", len(self.start_urls), "starting urls"
     self.export_results_filename = export_path
     self.start_time = time()
     self.crawled_pages = 0
     self.web_pages_export_dir = web_repository_export_path
     self.last_stats_print_time = time()
     self.last_stats_print_n_crawled = 0
     # The 4 following variables are going to be used to compute the avg number of outlinks that we are following
     # in order to understand why we crawl so many less pages than expected
     self.outlinks_followed = 0
     self.outlinks_other_domains_followed = 0
     self.outlinks_same_domains_followed = 0
     self.outlinks_average_divider = 0
     super(WebCrawlerSpider, self).__init__()
     dispatcher.connect(self.spider_closed, signals.spider_closed)
Example #9
File: t.py Project: szqh97/test
def main():
    """Setups item signal and run the spider"""
    # set up signal to catch items scraped
    from scrapy import signals
    from scrapy.xlib.pydispatch import dispatcher
 
    def catch_item(sender, item, **kwargs):
        print "Got:", item
 
    dispatcher.connect(catch_item, signal=signals.item_passed)
 
    # shut off log
    from scrapy.conf import settings
    settings.overrides['LOG_ENABLED'] = False
 
    # set up crawler
    from scrapy.crawler import CrawlerProcess
 
    crawler = CrawlerProcess(settings)
 
    # schedule spider
 
    # start engine scrapy/twisted
    print "STARTING ENGINE"
    crawler.start()
    print "ENGINE STOPPED"
Example #10
 def __init__(self, basedir):
     if '://' in basedir:
         basedir = basedir.split('://', 1)[1]
     self.basedir = basedir
     self._mkdir(self.basedir)
     self.created_directories = defaultdict(set)
     dispatcher.connect(self.spider_closed, signals.spider_closed)
Example #11
    def __init__(self):
        from scrapy.core import signals
        from scrapy.xlib.pydispatch import dispatcher

        self.conn = None
        dispatcher.connect(self.initialize, signals.engine_started)
        dispatcher.connect(self.finalize, signals.engine_stopped)
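A hedged sketch of what the two connected handlers might look like in a sqlite-backed pipeline; the class name and database path are placeholders, not taken from the project above.

import sqlite3

from scrapy import signals
from scrapy.xlib.pydispatch import dispatcher


class SQLiteConnectionPipeline(object):
    def __init__(self):
        self.conn = None
        dispatcher.connect(self.initialize, signals.engine_started)
        dispatcher.connect(self.finalize, signals.engine_stopped)

    def initialize(self):
        # Open the connection once the engine starts (placeholder path).
        self.conn = sqlite3.connect('items.db')

    def finalize(self):
        # Commit and release the connection when the engine stops.
        if self.conn is not None:
            self.conn.commit()
            self.conn.close()
            self.conn = None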
Example #12
    def __init__(self, url=None):

        #print "here i am"
        if url:
            # retrieve with post method, put for create, get for read, delete for delete
            # unvisitedurls http://localhost:5000/unvisitedurls?start=0&offset=10&spider=6w
            unirest.timeout(180)
            req = unirest.post(url, headers={"Accept":"application/json"})
            self.start_urls = [data['url'] for data in req.body['data']]
            self.name = url[url.find('spider=')+7:]

            self.visitedurldict = OrderedDict()
            self.datadict       = OrderedDict()
            self.filedict       = OrderedDict()
            self.deadurldict    = OrderedDict()

            self.visitedurldict['urls'] = []
            self.datadict['datas']      = []
            self.filedict['files']      = []
            self.deadurldict['urls']    = []

            rules = (
                Rule(sle(allow=("http://book.douban.com/isbn/\d+$")), callback="parse", follow=True),
                Rule(sle(allow=("http://book.douban.com/subject/\d+$")), callback="parse", follow=True),
            )
        # def __del__(self) work
        dispatcher.connect(self.spider_closed, signals.spider_closed)
Example #13
def create_crawler(spider):
    '''Setups item signal and run the spider'''
    from scrapy import signals
    from scrapy.xlib.pydispatch import dispatcher

    def catch_item(sender, item, **kwargs):
         print "Got:", item

    dispatcher.connect(catch_item, signal=signals.item_passed)

    # shut off log
    from scrapy.conf import settings
    settings.overrides['LOG_ENABLED'] = False

    # set up crawler
    from scrapy.crawler import CrawlerProcess

    crawler = CrawlerProcess(settings)
    crawler.install()
    crawler.configure()

    # schedule spider
    crawler.crawl(spider)

    return crawler
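A possible way to drive the helper above, following the same start/stop sequence used elsewhere on this page; MySpider is a placeholder spider class.

def run_example_crawl():
    spider = MySpider()  # placeholder spider instance
    crawler = create_crawler(spider)
    print "STARTING ENGINE"
    crawler.start()  # blocks until the crawl finishes
    print "ENGINE STOPPED"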
Example #14
 def __init__(self, *args, **kwargs):
     super(DjangoChecker, self).__init__(*args, **kwargs)
     self._check_checker_config()
     
     self.start_urls.append(self.scrape_url)
     self.scheduler = Scheduler(self.scraper.scraped_obj_class.scraper_scheduler_conf)
     dispatcher.connect(self.response_received, signal=signals.response_received)
Example #15
def main():
	"""Rutina principal para la ejecución del Spider"""
	# set up signal to catch items scraped
	from scrapy import signals
	from scrapy.xlib.pydispatch import dispatcher

	def catch_item(sender, item, **kwargs):
		print "Extracted item:", item
	dispatcher.connect(catch_item, signal=signals.item_passed)

	from scrapy.conf import settings
	settings.overrides['LOG_ENABLED'] = False

	# setup crawler
	from scrapy.crawler import CrawlerProcess

	crawler = CrawlerProcess(settings)
	crawler.install()
	crawler.configure()

	# define the spider for the crawler
	crawler.crawl(BloggerSpider())

	# start scrapy
	print "STARTING ENGINE"
	crawler.start()
	print "ENGINE STOPPED"
Example #16
    def __init__(self):
        dispatcher.connect(self.spider_closed, signals.spider_closed)
        if self.inputs_urls == "part_numbers.txt":
            parts_lst = list()
            parts = moxa.settings.loadFileIntoLIst(self.inputs_urls)
            # for part in parts:
            #     part_no = part.replace('-', '_')
            #     part_no = part_no.replace(' ', '_').strip() + '.htm'
            #     parts_lst.append(part_no)
            # parts_lst = [urljoin('http://www.moxa.com/product/', x) for x in parts_lst]
            # for p in parts_lst:
            #     print p
            self.inputs_urls_lst = parts  # copy.deepcopy(list(set(parts_lst)))
        elif self.inputs_urls == "part_numbers_and_urls.csv":
            parts_and_urls = moxa.settings.loadFileIntoLIst(self.inputs_urls)
            for p in parts_and_urls:
                purl = p.split(",")
                if len(purl) == 2:
                    part_number = purl[0]
                    part_url = purl[1]
                    self.parts[part_number] = part_url
        else:
            self.inputs_urls_lst = moxa.settings.loadFileIntoLIst(self.inputs_urls)

        moxa.settings.remaining_urls = copy.deepcopy(self.inputs_urls_lst)
Example #17
File: run.py Project: CkuT/crawlers
def crawl(spiders_classes, connector, debug=False, spider_error_callback=stdout_error_callback, scrapy_settings=None):
    """
    Launch crawl job for JobSpider class
    :param scrapy_settings: dict of setting merged with CrawlerProcess default settings
    :param debug: (bool) Activate or disable debug
    :param spider_error_callback: callback for spider errors (see http://doc.scrapy.org/en/latest/topics/signals.html#spider-error)
    :param connector: Connector instance
    :param spiders_classes: JobSpider class list
    :return: spider instance
    """
    if debug:
        dispatcher.connect(spider_error_callback, signals.spider_error)

    settings = {
        'ITEM_PIPELINES': {
            'pyjobs_crawlers.pipelines.RecordJobPipeline': 1,
        },
        'connector': connector,
        'LOG_ENABLED': False,
        'DOWNLOAD_DELAY': 1 if not debug else 0,
    }
    if scrapy_settings:
        settings.update(scrapy_settings)

    process = CrawlerProcess(settings)

    for spider_class in spiders_classes:
        process.crawl(spider_class, debug=debug)

    spiders = []
    for crawler in list(process.crawlers):
        spiders.append(crawler.spider)
    process.start()

    return spiders
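A hedged usage sketch for the crawl() helper above; JobsSpider and MyConnector are placeholders standing in for a real pyjobs_crawlers spider class and Connector instance.

spiders = crawl(
    spiders_classes=[JobsSpider],      # placeholder spider class
    connector=MyConnector(),           # placeholder Connector implementation
    debug=True,
    scrapy_settings={'DOWNLOAD_DELAY': 0.5},
)
for spider in spiders:
    print(spider.name)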
Example #18
 def __init__(self, settings):
     super(OffsiteMiddleware, self).__init__(settings)
     self.host_regexes = {}
     self.domains_seen = {}
     
     dispatcher.connect(self.spider_opened, signal=signals.spider_opened)
     dispatcher.connect(self.spider_closed, signal=signals.spider_closed)
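The connected handlers are not shown above; roughly, and hedged because this paraphrases Scrapy's stock OffsiteMiddleware rather than quoting this project, they cache a per-spider host regex on open and drop the per-spider state on close.

def spider_opened(self, spider):
    # Compile the allowed-domains pattern once per spider.
    self.host_regexes[spider] = self.get_host_regex(spider)
    self.domains_seen[spider] = set()

def spider_closed(self, spider):
    # Drop per-spider state so nothing leaks across crawls.
    del self.host_regexes[spider]
    del self.domains_seen[spider]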
Example #19
 def __init__(self):
     dispatcher.connect(self.spider_closed, signals.spider_closed)
     self.file = codecs.open('qiche.json', 'wb', encoding='utf-8')
     self.bucketname = 'scrapy_data_2'
     self.conn = S3Connection()
     self.getbucket = self.conn.get_bucket(self.bucketname)
     self.k = Key(self.getbucket)
Example #20
    def test_priorization(self):
        webdriver = Mock()
        settings = self.settings(WEBDRIVER_BROWSER=webdriver)
        webdriver.get.side_effect = self._wait
        webdriver.page_source = u''

        dispatcher.connect(self._stop_reactor, signal=signals.spider_closed)

        crawler = Crawler(Settings(values=settings))
        crawler.configure()
        spider = self.Spider(name='test', domain='testdomain')
        crawler.crawl(spider)
        crawler.start()
        log.start(loglevel='ERROR')
        reactor.run()

        assert webdriver.get.mock_calls == [
            call('http://testdomain/path?wr=0'),
            call('http://testdomain/path?wr=0&wa=0'),
            call('http://testdomain/path?wr=0&wa=1'),
            call('http://testdomain/path?wr=1'),
            call('http://testdomain/path?wr=1&wa=0'),
            call('http://testdomain/path?wr=1&wa=1'),
            call('http://testdomain/path?wr=0&wa=0&wr=0'),
            call('http://testdomain/path?wr=0&wa=1&wr=0'),
            call('http://testdomain/path?wr=1&wa=0&wr=0'),
            call('http://testdomain/path?wr=1&wa=1&wr=0')]
Example #21
    def __init__(self, settings, stats):
        """ Instantiate the Queue, Threads """
        self.queue = Queue()
        self.spider = ""
        self.lock = RLock()
        self.settings = settings
        self.stats = stats
        self.logger = logging.getLogger(self.__class__.__name__)
        dispatcher.connect(self.spider_closed, signals.spider_closed)

        # Instantiate Threads
        for i in range(self.settings['THREADS']):
            t = Thread(target=self.worker)
            t.daemon = True
            t.start()

        # Wait for the queue...
        self.queue.join()

        # Check whether download dir exists.
        if not os.path.exists(self.settings['DOWNLOAD_DIR']):
            self.logger.debug("Directory %s does not exist." %\
                self.settings['DOWNLOAD_DIR'])
            os.mkdir(self.settings['DOWNLOAD_DIR'])
        else:
            self.logger.debug("Directory %s exists!" %\
                self.settings['DOWNLOAD_DIR'])
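A minimal sketch of the worker loop the daemon threads above are assumed to run; the actual download step is a placeholder.

def worker(self):
    # Each thread pulls tasks off the shared queue for the lifetime of the process.
    while True:
        task = self.queue.get()
        try:
            pass  # placeholder: perform the download described by task
        finally:
            self.queue.task_done()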
Example #22
    def __init__(self, *args, **kwargs):
        super(VisegradSpider, self).__init__(*args, **kwargs)

        vpapi.parliament(self.get_parliament())
        vpapi.authorize(self.get_user(), self.get_password())

        dispatcher.connect(self.spider_opened, signals.spider_opened)
Example #23
 def __init__(self):
     self.crawler = CrawlerProcess(settings)
     if not hasattr(project, 'crawler'):
         self.crawler.install()
     self.crawler.configure()
     self.items = []
     dispatcher.connect(self._item_passed, signals.item_passed)
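A sketch of the _item_passed callback referenced above; with the item_passed signal the scraped item arrives as a keyword argument, and here it is simply buffered in memory.

def _item_passed(self, item, **kwargs):
    # Keep every scraped item so the caller can read self.items afterwards.
    self.items.append(item)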
Example #24
 def __init__(self):
     for d in settings.ALLOWED_DOMAINS:
         self.filename += d
     self.filename += ".db"
     self.conn = None
     dispatcher.connect(self.initialize, signals.engine_started)
     dispatcher.connect(self.initialize, signals.engine_stopped)
Example #25
 def __init__(self, spider,query,results):
     Process.__init__(self)
     self.results = results
     self.items = []
     self.query = query
     self.spider = spider
     dispatcher.connect(self._item_passed, signals.item_passed)
Example #26
 def __init__(self):
     dispatcher.connect(self.finalize, scrapy.signals.engine_stopped)
     if not os.path.exists(self.res_folder):
         os.makedirs(self.res_folder)
     self.file_200 = open(self.res_folder + self.list_200, 'w')
     self.file_302 = open(self.res_folder + self.list_302, 'w')
     self.file_others = open(self.res_folder + self.list_others, 'w')
Example #27
    def test_send_catch_log(self):
        test_signal = object()
        handlers_called = set()

        def log_received(event):
            handlers_called.add(log_received)
            assert "error_handler" in event['message'][0]
            assert event['logLevel'] == log.ERROR

        txlog.addObserver(log_received)
        dispatcher.connect(self.error_handler, signal=test_signal)
        dispatcher.connect(self.ok_handler, signal=test_signal)
        result = yield defer.maybeDeferred(self._get_result, test_signal, arg='test',
                                           handlers_called=handlers_called)

        assert self.error_handler in handlers_called
        assert self.ok_handler in handlers_called
        assert log_received in handlers_called
        self.assertEqual(result[0][0], self.error_handler)
        self.assert_(isinstance(result[0][1], Failure))
        self.assertEqual(result[1], (self.ok_handler, "OK"))

        txlog.removeObserver(log_received)
        self.flushLoggedErrors()
        dispatcher.disconnect(self.error_handler, signal=test_signal)
        dispatcher.disconnect(self.ok_handler, signal=test_signal)
Example #28
 def handle(self, *args, **options):
     from scrapy import signals
     from scrapy.xlib.pydispatch import dispatcher
     
     def catch_item(sender, item, **kwargs):
         print "Got:", item
         
     dispatcher.connect(catch_item, signal=signals.item_passed)
     
     from scrapy.conf import settings
     settings.overrides['LOG_ENABLED'] = True
     
     from scrapy.crawler import CrawlerProcess
     
     crawler = CrawlerProcess(settings)
     crawler.install()
     crawler.configure()
     
     from alescspider.spiders import *
     spiders = [deputado_spider.DeputadoSpider()]
     #spiders = [presenca_spider.PresencaSpider(), votos_spider.VotosSpider(), deputado_spider.DeputadoSpider()]
     for spider in spiders:
         crawler.queue.append_spider(spider)
     
     print "STARTING ENGINE"
     crawler.start()
     print "ENGINE STOPPED"
     
Example #29
    def run(self):
        self.port = start_test_site()
        self.portno = self.port.getHost().port

        start_urls = [self.geturl("/"), self.geturl("/redirect")]
        self.spider = TestSpider(start_urls=start_urls)

        for name, signal in vars(signals).items():
            if not name.startswith('_'):
                dispatcher.connect(self.record_signal, signal)

        self.crawler = get_crawler()
        self.crawler.install()
        self.crawler.configure()
        self.crawler.signals.connect(self.item_scraped, signals.item_scraped)
        self.crawler.signals.connect(
            self.request_scheduled, signals.request_scheduled)
        self.crawler.signals.connect(
            self.response_downloaded, signals.response_downloaded)
        self.crawler.crawl(self.spider)
        self.crawler.start()

        self.deferred = defer.Deferred()
        dispatcher.connect(self.stop, signals.engine_stopped)
        return self.deferred
Example #30
def run_spider(spider, settings):
    """Run a spider with given settings"""
    from scrapy import signals
    from scrapy.xlib.pydispatch import dispatcher
    from scrapy.settings import CrawlerSettings
      
    def catch_item(sender, item, **kwargs):
        #log.msg("Got:" + str(item))
        pass
       
    dispatcher.connect(catch_item, signal=signals.item_passed)

    """clean storage"""
    scraperwiki.sqlite.execute("drop table if exists "+spider.name)
    scraperwiki.sqlite.commit()


    from scrapy.crawler import CrawlerProcess

    settings = CrawlerSettings(values=settings)

    crawler = CrawlerProcess(settings)
    crawler.install()
    crawler.configure()
    crawler.crawl(spider)

    #log.start(loglevel='DEBUG')

    crawler.start()
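A hedged example of calling run_spider from a ScraperWiki script; MySpider and the settings values are placeholders.

custom_settings = {
    'USER_AGENT': 'scraperwiki-demo (+http://example.org)',  # placeholder
    'DOWNLOAD_DELAY': 1.0,
}
run_spider(MySpider(), custom_settings)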
Example #31
 def __init__(self):
     self.httpHash = {}
     self.socks5Hash = {}
     self.httpFile = open('http.txt', 'w')
     self.socks5File = open('socks5.txt', 'w')
     dispatcher.connect(self.finalize, scrapy.signals.engine_stopped)
Example #32
 def __init__(self):
     log.start()
     dispatcher.connect(self.spider_opened, signals.spider_opened)
     dispatcher.connect(self.spider_closed, signals.spider_closed)
     self.fjsons = {}
Example #33
    def __init__(self, *args, **kwargs):
        #turn off annoying logging, set LOG_LEVEL=DEBUG in settings.py to see more logs
        logger = logging.getLogger('scrapy.middleware')
        logger.setLevel(logging.WARNING)

        dispatcher.connect(self.save_cookie, signal=signals.spider_closed)

        super().__init__(*args, **kwargs)

        #parse date
        if 'date' not in kwargs:
            self.logger.info(
                'Date attribute not provided, scraping the last 8 days by default'
            )
            self.date = datetime.today() - timedelta(days=8)
            #self.date = datetime(2004,2,4)
        else:
            self.date = datetime.strptime(kwargs['date'], '%Y-%m-%d')
            self.logger.info(
                'Date attribute provided, fbcrawl will stop crawling at {}'.
                format(kwargs['date']))
        self.year = self.date.year

        #parse start date
        if 'skipto_date' in kwargs:
            self.skipto_date = datetime.strptime(kwargs['skipto_date'],
                                                 '%Y-%m-%d')
            self.logger.info(
                'Skipto Date attribute provided, fbcrawl will start crawling at {}'
                .format(kwargs['skipto_date']))
        else:
            self.skipto_date = datetime.today() - timedelta(days=7)

        #parse lang, if not provided (but is supported) it will be guessed in parse_home
        if 'lang' not in kwargs:
            self.logger.info(
                'Language attribute not provided, fbcrawl will try to guess it from the fb interface'
            )
            self.logger.info(
                'To specify, add the lang parameter: scrapy fb -a lang="LANGUAGE"'
            )
            self.logger.info(
                'Currently choices for "LANGUAGE" are: "en", "es", "fr", "it", "pt"'
            )
            self.lang = '_'
        elif self.lang in ('en', 'es', 'fr', 'it', 'pt'):
            self.logger.info(
                'Language attribute recognized, using "{}" for the facebook interface'
                .format(self.lang))
        else:
            self.logger.info('Lang "{}" not currently supported'.format(
                self.lang))
            self.logger.info(
                'Currently supported languages are: "en", "es", "fr", "it", "pt"'
            )
            self.logger.info(
                'Change your interface lang from facebook settings and try again'
            )
            raise AttributeError('Language provided not currently supported')

        #max num of posts to crawl
        if 'max' not in kwargs:
            self.max = int(10e5)
        else:
            self.max = int(kwargs['max'])

        #current year, this variable is needed for proper parse_page recursion
        self.k = datetime.now().year

        self.url_root = 'https://mbasic.facebook.com/'
        self.start_urls = self.load_urllist(
            os.path.join(os.path.dirname(__file__),
                         '../../fp_urls')) + self.load_urllist(
                             os.path.join(os.path.dirname(__file__),
                                          '../../group_urls'))

        self.cookie_path = os.path.join(os.path.dirname(__file__),
                                        '../../cookie.json')
        self.cookie = self.load_cookie()
Example #34
 def __init__(self):
     self.filename = self.name + ".txt"
     self.filename_tmp = self.name + "_tmp.txt"
     self.file_tmp = codecs.open(self.filename_tmp, 'a', 'utf-8')
     dispatcher.connect(self.finalize, scrapy.signals.engine_stopped)
Example #35
 def __init__(self, **kwargs):
     super().__init__(**kwargs)
     dispatcher.connect(self.quit, signals.spider_closed)
Example #36
 def __init__(self):
     self.driver = webdriver.Firefox(
         executable_path=r'C:\Code\Python\Selenium\geckodriver.exe')
     super(GamewebSpider, self).__init__()
     dispatcher.connect(self.spider_closed, signals.spider_closed)
Example #37
 def __init__(self):
     dispatcher.connect(self.spider_closed, signals.spider_closed)
     self.connection = sqlite3.connect(settings.DATABASE_NAME)
     self.cursor = self.connection.cursor()
     self.gigs_to_send = []
Example #38
 def __init__(self, **kwargs):
     self.fail_urls = []
     dispatcher.connect(self.handle_spider_closed, signals.spider_closed)
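A possible shape for the handle_spider_closed handler above; spider and reason are supplied by the spider_closed signal, and what is done with the failed URLs is a placeholder.

def handle_spider_closed(self, spider, reason):
    # Placeholder: report the URLs that failed during the crawl.
    print("spider closed (%s), %d failed urls" % (reason, len(self.fail_urls)))
    for url in self.fail_urls:
        print(url)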
Example #39
 def __init__(self):
     options = webdriver.ChromeOptions()
     options.add_argument('headless')
     # pass the headless options through to the driver
     self.browser = webdriver.Chrome(chrome_options=options)
     super(LagouSpider, self).__init__()
     dispatcher.connect(self.spider_closed, signals.spider_closed)
Example #40
 def __init__(self):
     super(guiZhouTransformNoticeSpider, self).__init__()
     dispatcher.connect(self.CloseSpider, signals.spider_closed)
     self.targetUrl = 'http://zrzy.guizhou.gov.cn/zfxxgk/zfxxgkml/zdlyxxgkml/tdcrzrgg/index_{}.html'
     self.header = {'User-Agent': random.choice(agent_list)}
     self.reStr = r'()\w\.:: 。\(\)〔〕≤;,≥《》\-\/\%,、\.'
Example #41
    def __init__(self, *args, **kwargs):
        super(OakfurniturelandcoSpider, self).__init__(*args, **kwargs)
        dispatcher.connect(self.spider_idle, signals.spider_idle)

        self.parse_all = True
Example #42
 def __init__(self, *args, **kwargs):
     super(DfscoSpider, self).__init__(*args, **kwargs)
     dispatcher.connect(self.spider_idle, signals.spider_idle)
Example #43
 def __init__ (self):
     super(B2bNewSpider,self).__init__()
     # bind initial/finalize to the engine started/stopped signals
     dispatcher.connect(self.initial,signals.engine_started)
     dispatcher.connect(self.finalize,signals.engine_stopped)
Example #44
 def __init__(self):
     dispatcher.connect(self.spider_closed, signals.spider_closed)
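This is the smallest form of the pattern repeated throughout these examples; the matching handler is a single method whose spider argument is filled in by the spider_closed signal, sketched here with a placeholder body.

def spider_closed(self, spider):
    # Placeholder teardown: close files, database connections, webdrivers, etc.
    pass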
Example #45
 def __init__(self):
     super(JsonWithEncodingPipeline, self).__init__()
     self.suffix = 'json'
     dispatcher.connect(self.spider_opened, signals.spider_opened)
     dispatcher.connect(self.spider_closed, signals.spider_closed)
     self.files = {}
Example #46
 def __init__(self, allowed_domains=None, model=None, *args, **kwargs):
     super(GoogleNewsSpider, self).__init__(*args, **kwargs)
     self.isStop = 3
     # bind handlers to the spider_idle and item_scraped signals
     dispatcher.connect(self.spider_closed, signals.spider_idle)
     dispatcher.connect(self.spider_item, signals.item_scraped)
Example #47
 def __init__(self):
     self.buffer = 20
     self.data = []
     self.counter = 0
     dispatcher.connect(self.spider_closed, signals.spider_closed)
Example #48
 def __init__(self):
     super(CsvExportPipeline, self).__init__()
     self.suffix = 'csv'
     dispatcher.connect(self.spider_opened, signals.spider_opened)
     dispatcher.connect(self.spider_closed, signals.spider_closed)
     self.files = {}
Example #49
 def __init__(self, filename=None):
     self.workbook = xlwt.Workbook()
     self.sheet = self.workbook.add_sheet('Sheet_1')
     
     self.driver = webdriver.Firefox()
     dispatcher.connect(self.spider_closed, signals.spider_closed)
Example #50
    def __init__(self):
        dispatcher.connect(self.CloseSpider, signals.spider_closed)
        self.targetUrl = 'http://jcjg.nr.gd.gov.cn:8088/GisqReport7.0/ReportServer?reportlet=other/mhgg/crgg.cpt'
        self.header = {'User-Agent': random.choice(agent_list)}

        self.origin_url = 'http://jcjg.nr.gd.gov.cn:8088/GisqReport7.0/ReportServer?_={}&__boxModel__=true&op=page_content&sessionID={}&pn=1'
Example #51
            i = i + 1
            csk.writerow([
                item['pid'][0], item['rating'][0], item['userid'][0],
                item['date'][0], item['title'][0], item['rbody'][0],
                item['helpful'], item['totalreview'], item['reviewid']
            ])

        next_page = hxs.select(
            '//a[@class="nav_bar_next_prev"]/@href').extract()
        for n in next_page:
            if n:
                yield Request(urlparse.urljoin("http://www.flipkart.com",
                                               n[1:]),
                              callback=self.par_ur)


def stop_reactor():
    reactor.stop()


dispatcher.connect(stop_reactor, signal=signals.spider_closed)
spider = MySpiderd()
crawler = Crawler(Settings())
crawler.configure()
crawler.crawl(spider)
crawler.start()
log.start(loglevel=logging.DEBUG)
log.msg('Running reactor...')
reactor.run()  # the script will block here until the spider is closed
log.msg('Reactor stopped.')
Example #52
 def __init__(self):
     dispatcher.connect(self.engine_started, signal=signals.engine_started)
     dispatcher.connect(self.spider_opened, signal=signals.spider_opened)
     dispatcher.connect(self.spider_closed, signal=signals.spider_closed)
Example #53
    def __init__(self, *args, **kwargs):
        BaseSpider.__init__(self, *args, **kwargs)
        dispatcher.connect(self.process_pending, signals.spider_idle)

        self.run_num = 1
Example #54
 def __init__(self):
     logger.debug('Start crawling escience job listings')
     self.url = 'https://escience.org.cn'
     self.items = []
     dispatcher.connect(self.spider_closed, signals.spider_closed)
Example #55
	def __init__(self, filename=None):
		with open(os.path.dirname(__file__) + '/../../link.txt','r') as f:
			self.start_urls = [f.read()]
		self.driver = webdriver.Chrome()
		dispatcher.connect(self.spider_closed, signals.spider_closed)
Example #56
 def __init__(self, *args, **kwargs):
     super(GuitarGuitarAndertons, self).__init__(*args, **kwargs)
     self._all_categories_parsed = False
     dispatcher.connect(self.spider_idle, signals.spider_idle)
     self.seen = set()
Example #57
 def __init__(self):
     super(laSaResultNoticeSpider, self).__init__()
     dispatcher.connect(self.CloseSpider, signals.spider_closed)
     self.targetUrl = 'http://ggzy.lasa.gov.cn/Article/SearchArticle'
     self.header = {'User-Agent': random.choice(agent_list)}
     self.reStr = r'()\w\.:: 。 \(\)〔〕㎡≤;,≥《》\-\/\%,、\.﹪㎡'
Example #58
 def __init__(self):
     self.conn = None
     dispatcher.connect(self.initialize, signals.engine_started)
     dispatcher.connect(self.finalize, signals.engine_stopped)
Example #59
 def __init__(self):
     self.browser = webdriver.Chrome(
         executable_path="E:/SE/chromedriver.exe")
     super(JobbleSpider, self).__init__()
     dispatcher.connect(self.spider_closed, signals.spider_closed)
Example #60
 def __init__(self):
     dispatcher.connect(self.spider_opened, signals.spider_opened)
     dispatcher.connect(self.spider_closed, signals.spider_closed)
     self.links = []
     self.changeCounter = 0