def main():
    """Setups item signal and run the spider"""
    # set up signal to catch items scraped
    from scrapy import signals
    from scrapy.xlib.pydispatch import dispatcher
    
    
    def catch_item(sender, item, **kwargs):
        print "Got:", item

    # shut off log
    from scrapy.conf import settings
    settings.overrides['LOG_ENABLED'] = False

    # set up crawler
    from scrapy.crawler import CrawlerProcess

    crawler = CrawlerProcess(settings)
    crawler.install()
    crawler.configure()

    # schedule spider
    crawler.crawl(MySpider())

    # start engine scrapy/twisted
    print "STARTING ENGINE"
    crawler.start()
    print "ENGINE STOPPED"
class CrawlerWorker(multiprocessing.Process):
 
    def __init__(self, spider, result_queue):
        multiprocessing.Process.__init__(self)
        self.result_queue = result_queue
 
        self.crawler = CrawlerProcess(settings)
        if not hasattr(project, 'crawler'):
            self.crawler.install()
        self.crawler.configure()
 
        self.items = []
        self.spider = spider
        dispatcher.connect(self._item_passed, signals.item_passed)
    #__init__
    
    def _item_passed(self, item):
        self.items.append(item)
    # _item_passed
    
    def run(self):
        self.crawler.crawl(self.spider)
        self.crawler.start()
        self.crawler.stop()
        self.result_queue.put(self.items)
    #run
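A hypothetical way to drive the worker above from the parent process, assuming MySpider and the multiprocessing import used by the class are in scope:

result_queue = multiprocessing.Queue()
worker = CrawlerWorker(MySpider(), result_queue)
worker.start()                      # run the crawl in a child process
worker.join()                       # wait for the crawl to finish
scraped_items = result_queue.get()  # items collected via the item_passed signal
print "collected", len(scraped_items), "items"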
def handle(self, *args, **options):
    from scrapy import signals
    from scrapy.xlib.pydispatch import dispatcher

    def catch_item(sender, item, **kwargs):
        print "Got:", item

    dispatcher.connect(catch_item, signal=signals.item_passed)

    from scrapy.conf import settings
    settings.overrides['LOG_ENABLED'] = True

    from scrapy.crawler import CrawlerProcess

    crawler = CrawlerProcess(settings)
    crawler.install()
    crawler.configure()

    from alescspider.spiders import *
    spiders = [deputado_spider.DeputadoSpider()]
    #spiders = [presenca_spider.PresencaSpider(), votos_spider.VotosSpider(), deputado_spider.DeputadoSpider()]
    for spider in spiders:
        crawler.queue.append_spider(spider)

    print "STARTING ENGINE"
    crawler.start()
    print "ENGINE STOPPED"
     
Example #4
def scrapeando():
    from scrapy import signals
    from scrapy.xlib.pydispatch import dispatcher

    def catch_item(sender, item, **kwargs):
        """Rellenamos la BD"""
        for i in enumerate(item.items()):
            x = i[0]
            query = "INSERT INTO book (Nombre ,Autor, Editorial ,Fecha, Precio, Link) VALUES ("+decodifica(item['Nombre'][x])+","+decodifica(item['Autor'][x])+","+decodifica(item['Editorial'][x])+","+decodifica(item['Fecha'][x])+","+decodifica(item['Precio'][x])+","+decodifica("http://www.casadellibro.com"+item['Link'][x])+");"
            db.micursor.execute(query)
            db.conexion.commit()
        print item

    dispatcher.connect(catch_item, signal=signals.item_passed)

    from scrapy.conf import settings
    settings.overrides['LOG_ENABLED'] = False

    from scrapy.crawler import CrawlerProcess

    crawler = CrawlerProcess(settings)
    crawler.install()
    crawler.configure()
    book = BookSpider()
    book.busqueda = unicode(search.getbusqueda())
    crawler.crawl(book)
    print "Start scraping la Casa del Libro"
    crawler.start()
    print "Finished scraping la Casa del Libro"
    crawler.stop()
def run_spider(spider, settings):
    """Run a spider with given settings"""
    from scrapy import signals
    from scrapy.xlib.pydispatch import dispatcher
    from scrapy.settings import CrawlerSettings
      
    def catch_item(sender, item, **kwargs):
        #log.msg("Got:" + str(item))
        pass
       
    dispatcher.connect(catch_item, signal=signals.item_passed)

    """clean storage"""
    scraperwiki.sqlite.execute("drop table if exists "+spider.name)
    scraperwiki.sqlite.commit()


    from scrapy.crawler import CrawlerProcess

    settings = CrawlerSettings(values=settings)

    crawler = CrawlerProcess(settings)
    crawler.install()
    crawler.configure()
    crawler.crawl(spider)

    #log.start(loglevel='DEBUG')

    crawler.start()
Example #6
class CrawlerScript():

    def __init__(self):
        self.crawler = CrawlerProcess(settings)
        if not hasattr(project, 'crawler'):
            self.crawler.install()
        self.crawler.configure()
        self.items = []
        dispatcher.connect(self._item_passed, signals.item_passed)

    def _item_passed(self, item):
        self.items.append(item)

    def _crawl(self, queue, spider_name):
        spider = self.crawler.spiders.create(spider_name)
        if spider:
            self.crawler.queue.append_spider(spider)
        self.crawler.start()
        self.crawler.stop()
        queue.put(self.items)

    def crawl(self, spider):
        queue = Queue()
        p = Process(target=self._crawl, args=(queue, spider,))
        p.start()
        p.join()
        return queue.get(True)
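A hypothetical call site for the class above; "myspider" stands in for a spider name registered in the project:

items = CrawlerScript().crawl("myspider")
print "scraped %d items" % len(items)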
Example #7
def main():
	"""Rutina principal para la ejecución del Spider"""
	# set up signal to catch items scraped
	from scrapy import signals
	from scrapy.xlib.pydispatch import dispatcher

	def catch_item(sender, item, **kwargs):
		print "Item Extraido:", item
	dispatcher.connect(catch_item, signal=signals.item_passed)

	from scrapy.conf import settings
	settings.overrides['LOG_ENABLED'] = False

	# setup crawler
	from scrapy.crawler import CrawlerProcess

	crawler = CrawlerProcess(settings)
	crawler.install()
	crawler.configure()

	# define the spider for the crawler
	crawler.crawl(BloggerSpider())

	# start scrapy
	print "STARTING ENGINE"
	crawler.start()
	print "ENGINE STOPPED"
Example #8
def test_cp():
    crawlerProcess = CrawlerProcess(scrapy_conf)
    crawlerProcess.install()
    crawlerProcess.configure()    

    crawlerProcess.queue.append_spider(myspider)
    crawlerProcess.start()
Example #9
def create_crawler(spider):
    '''Sets up the item signal and runs the spider'''
    from scrapy import signals
    from scrapy.xlib.pydispatch import dispatcher

    def catch_item(sender, item, **kwargs):
         print "Got:", item

    dispatcher.connect(catch_item, signal=signals.item_passed)

    # shut off log
    from scrapy.conf import settings
    settings.overrides['LOG_ENABLED'] = False

    # set up crawler
    from scrapy.crawler import CrawlerProcess

    crawler = CrawlerProcess(settings)
    crawler.install()
    crawler.configure()

    # schedule spider
    crawler.crawl(spider)

    return crawler
def main():
    """Setups item signal and run the spider"""
    # set up signal to catch items scraped
    from scrapy import signals
    from scrapy.xlib.pydispatch import dispatcher

    def catch_item(sender, item, **kwargs):
        print "Got:", item

    options = parse_args()

    dispatcher.connect(catch_item, signal=signals.item_passed)

    # shut off log
    from scrapy.conf import settings
    settings.overrides['LOG_ENABLED'] = True
    settings.overrides['DEPTH_LIMIT'] = 2


    # set up crawler
    from scrapy.crawler import CrawlerProcess

    crawler = CrawlerProcess(settings)
    crawler.install()
    crawler.configure()

    # schedule spider
    spider = MySpider(input=options.input, output=options.output)
    crawler.queue.append_spider(spider)

    # start engine scrapy/twisted
    print "STARTING ENGINE"
    crawler.start()
    print "ENGINE STOPPED"
def run_spider(spider, settings):
    from scrapy.crawler import CrawlerProcess
    crawler = CrawlerProcess(settings)
    crawler.install()
    crawler.configure()
    crawler.crawl(spider)
    log.start()
    crawler.start()
def _run_spider(spider):
    """ private method to reduce boilerplate """
    settings = get_project_settings()
    crawler = CrawlerProcess(settings)
    crawler.configure()
    p = Process(target=_crawl,args=[crawler,spider])
    p.start()
    p.join()
Example #13
def run_spider(spider, settings):
    """Run a spider with given settings"""
    from scrapy.crawler import CrawlerProcess
    crawler = CrawlerProcess(settings)
    crawler.install()
    crawler.configure()
    crawler.crawl(spider)
    log.start()
    crawler.start()
def run_spider(spider, settings):
    """Run a spider with given settings"""
    from scrapy.crawler import CrawlerProcess
    crawler = CrawlerProcess(settings)
    crawler.install()
    crawler.configure()
    crawler.crawl(spider)
    log.start()
    crawler.start()
Example #15
def run_spider(spider, settings=None):
    """Run a spider instance through the scrapy crawler.

    This function is suitable for standalone scripts.
    """
    crawler = CrawlerProcess(_build_settings(settings))
    crawler.install()
    crawler.configure()
    log.start_from_crawler(crawler)
    crawler.crawl(spider)
    crawler.start()
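A hypothetical standalone invocation, assuming MySpider is importable from the project (the defaults produced by _build_settings are used):

if __name__ == "__main__":
    run_spider(MySpider())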
def run_spider(spider, settings=None):
    """Run a spider instance through the scrapy crawler.

    This function is suitable for standalone scripts.
    """
    crawler = CrawlerProcess(_build_settings(settings))
    crawler.install()
    crawler.configure()
    log.start_from_crawler(crawler)
    crawler.crawl(spider)
    crawler.start()
Example #17
def run_tide_scrapy(stationID, startDate, endDate, **kwargs):
    settings.overrides.update({}) # your settings
    crawlerProcess = CrawlerProcess(settings)
    crawlerProcess.install()
    crawlerProcess.configure()
    spider = TideSpider(stationID, startDate, endDate)
    crawlerProcess.crawl(spider)
    try:
        crawlerProcess.start()
    except:
        print "error"
Example #18
def run_tide_scrapy(stationID, startDate, endDate, **kwargs):
    settings.overrides.update({})  # your settings
    crawlerProcess = CrawlerProcess(settings)
    crawlerProcess.install()
    crawlerProcess.configure()
    spider = TideSpider(stationID, startDate, endDate)
    crawlerProcess.crawl(spider)
    try:
        crawlerProcess.start()
    except:
        print "error"
def run_spider(spider, settings):
    from scrapy.crawler import CrawlerProcess
    crawler = CrawlerProcess(settings)
    crawler.install()
    crawler.configure()

    # schedule spider
    crawler.crawl(spider)
    
    log.start()
    # start engine scrapy/twisted
    crawler.start()
Example #20
def run_water_spider(startDate, endDate, **kwargs):
    water_crawlerProcess = CrawlerProcess(settings)
    water_crawlerProcess.install()
    water_crawlerProcess.configure()
    
    spider = WaterSpider("8735180", sys.argv[1], sys.argv[2])
    water_crawlerProcess.crawl(spider)
    try:
        water_crawlerProcess.start()
        water_crawlerProcess.stop()
        water_crawlerProcess.uninstall()
    except Exception as e:
        print e
Example #21
def run_river_spider(startDate, endDate, **kwargs):
    water_crawlerProcess = CrawlerProcess(settings)
    water_crawlerProcess.install()
    water_crawlerProcess.configure()

    spider = RiverSpider(sys.argv[1], sys.argv[2])
    water_crawlerProcess.crawl(spider)
    try:
        water_crawlerProcess.start()
        water_crawlerProcess.stop()
        water_crawlerProcess.uninstall()
    except Exception as e:
        print e
Example #22
def run_wind_spider(startDate, endDate, **kwargs):
    wind_crawlerProcess = CrawlerProcess(settings)
    wind_crawlerProcess.install()
    wind_crawlerProcess.configure()
    spider2 = WindSpider("dpia1", sys.argv[1], sys.argv[2])

    wind_crawlerProcess.crawl(spider2)
    try:
        wind_crawlerProcess.start()
        wind_crawlerProcess.stop()
        wind_crawlerProcess.uninstall()
    except Exception as e:
        print e
Example #23
def runscrapy(stationID, startDate, endDate, **kwargs):
    crawlerProcess = CrawlerProcess(settings)
    crawlerProcess.install()
    crawlerProcess.configure()

    spider = Spider(stationID, startDate, endDate)
    crawlerProcess.crawl(spider)
    try:
        crawlerProcess.start()
        crawlerProcess.stop()
        crawlerProcess.uninstall()
    except Exception as e:
        print e
Example #24
def runscrapy(stationID, startDate, endDate, **kwargs):
    crawlerProcess = CrawlerProcess(settings)
    crawlerProcess.install()
    crawlerProcess.configure()

    spider = Spider(stationID, startDate, endDate)
    crawlerProcess.crawl(spider)
    try:
        crawlerProcess.start()
        crawlerProcess.stop()
        crawlerProcess.uninstall()
    except Exception as e:
        print e
Example #25
def run_wind_spider(startDate, endDate, **kwargs):
    wind_crawlerProcess = CrawlerProcess(settings)
    wind_crawlerProcess.install()
    wind_crawlerProcess.configure()
    spider2 = WindSpider("dpia1", sys.argv[1], sys.argv[2])

    wind_crawlerProcess.crawl(spider2)
    try:
        wind_crawlerProcess.start()
        wind_crawlerProcess.stop()
        wind_crawlerProcess.uninstall()
    except Exception as e:
        print e
Example #26
def main_spider():
    """Setups item signal and run the spider"""
    # set up signal to catch items scraped
    from scrapy import signals
    from scrapy.xlib.pydispatch import dispatcher

    def catch_item(sender, item, **kwargs):
        print "Got:", item

    dispatcher.connect(catch_item, signal=signals.item_passed)

    # setting
    setting()
    
    # set log
    start(logfile='log/spider/spider.log', loglevel='INFO', logstdout=False)
    # set up crawler
    from scrapy.crawler import CrawlerProcess

    crawler = CrawlerProcess(settings)
    crawler.install()
    crawler.configure()

    # schedule spider
    crawler.crawl(coreSpider())

    # start engine scrapy/twisted
    print "STARTING ENGINE"
    crawler.start()
    print '********************'
    c = coreSpider()
    getinfo = c.get_getinfo()
    print getinfo
    urls = c.get_urls()
    forms = c.get_forms(urls) 
    print forms
    print len(c.get_urls())
    print "ENGINE STOPPED"
    # scanner
    h = HTTP()
    a = Attack_XSS(h)
    tmp = a.attack(getinfo,forms)
    print '%%%%%%%%%%%%%%%'
    
    print 'per XSS start'
    
    p_xss = Attack_permanentXSS(h)
    p_xss.attack_p(getinfo, forms, tmp)
Example #27
def main_spider():
    """Setups item signal and run the spider"""
    # set up signal to catch items scraped
    from scrapy import signals
    from scrapy.xlib.pydispatch import dispatcher

    def catch_item(sender, item, **kwargs):
        print "Got:", item

    dispatcher.connect(catch_item, signal=signals.item_passed)

    # setting
    setting()

    # set log
    start(logfile='log/spider/spider.log', loglevel='INFO', logstdout=False)
    # set up crawler
    from scrapy.crawler import CrawlerProcess

    crawler = CrawlerProcess(settings)
    crawler.install()
    crawler.configure()

    # schedule spider
    crawler.crawl(coreSpider())

    # start engine scrapy/twisted
    print "STARTING ENGINE"
    crawler.start()
    print '********************'
    c = coreSpider()
    getinfo = c.get_getinfo()
    print getinfo
    urls = c.get_urls()
    forms = c.get_forms(urls)
    print forms
    print len(c.get_urls())
    print "ENGINE STOPPED"
    # scanner
    h = HTTP()
    a = Attack_XSS(h)
    tmp = a.attack(getinfo, forms)
    print '%%%%%%%%%%%%%%%'

    print 'per XSS start'

    p_xss = Attack_permanentXSS(h)
    p_xss.attack_p(getinfo, forms, tmp)
Example #28
def run_spider(spider, settings, loglevel='INFO'):
    """
    Run a spider with given settings
    """
    if 'SENTRY_DSN' in os.environ:
        import scrapy_sentry
        scrapy_sentry.init(os.environ['SENTRY_DSN'])
        settings.overrides.update({
            'SENTRY_SIGNALS': ['spider_error']
        })
    crawler = CrawlerProcess(settings)
    crawler.install()
    crawler.configure()
    crawler.crawl(spider)
    log.start(loglevel=loglevel)
    crawler.start()
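A hypothetical invocation of the Sentry-aware runner above; the DSN is a placeholder, scrapy_sentry must be installed, and settings is the project's old-style settings object (it must expose .overrides):

os.environ["SENTRY_DSN"] = "https://public-key@sentry.example.com/1"
run_spider(MySpider(), settings, loglevel="DEBUG")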
def get_spiders():
    """returns a dict of spiders
    """
    settings = get_project_settings()
    crawler = CrawlerProcess(settings)
    crawler.settings = settings
    crawler.configure()

    spiders = {}
    for spname in crawler.spiders.list():
        spider = crawler.spiders.create(spname)
        module_name = spider.__module__
        if not "_feedspider" in module_name:
            match_obj = re.match(r"openrecipes\.spiders\.([a-zA-Z0-9]+)_spider", module_name)
            if match_obj:
                short_name = match_obj.group(1)
                spiders[short_name] = spider

    return spiders
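A hypothetical follow-up that reuses the legacy queue API seen in the other snippets to run every discovered spider in a single process:

crawler = CrawlerProcess(get_project_settings())
crawler.install()
crawler.configure()
for short_name, spider in get_spiders().items():
    crawler.queue.append_spider(spider)
crawler.start()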
Example #30
def run_tests(spider, output_file, settings):
    """
    Helper for running test contractors for a spider and output an
    XUnit file (for CI)

    For using offline input the HTTP cache is enabled
    """

    settings.overrides.update({
        "HTTPCACHE_ENABLED": True,
        "HTTPCACHE_EXPIRATION_SECS": 0,
    })

    crawler = CrawlerProcess(settings)

    contracts = build_component_list(
        crawler.settings['SPIDER_CONTRACTS_BASE'],
        crawler.settings['SPIDER_CONTRACTS'],
    )

    xunit = Xunit()
    xunit.enabled = True
    xunit.configure(AttributeDict(xunit_file=output_file), Config())
    xunit.stopTest = lambda *x: None

    check = CheckCommand()
    check.set_crawler(crawler)
    check.settings = settings
    check.conman = ContractsManager([load_object(c) for c in contracts])
    check.results = xunit
    # these are specially crafted requests that run tests as callbacks
    requests = check.get_requests(spider)

    crawler.install()
    crawler.configure()
    crawler.crawl(spider, requests)
    log.start(loglevel='DEBUG')

    # report is called when the crawler finishes, it creates the XUnit file
    report = lambda: check.results.report(check.results.error_report_file)
    dispatcher.connect(report, signals.engine_stopped)

    crawler.start()
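A hypothetical call, with the spider, the XUnit output path and get_project_settings() (from scrapy.utils.project) standing in for project-specific values:

run_tests(MySpider(), "contract-results.xml", get_project_settings())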
Example #31
class CrawlerScript():
    def __init__( self ):
        self.crawler = CrawlerProcess( Settings() )
        self.crawler.install()
        self.crawler.configure()
    def _crawl( self, queue, search ):
        log.start( loglevel = log.DEBUG )
        current_spider = CraigslistSpider()
        if search:
            current_spider.set_search_url( search )
        self.crawler.crawl( current_spider )
        self.crawler.start()
        self.crawler.stop()
        queue.put( current_spider.get_object_list() )
    def crawl( self, search = "" ):
        q = Queue()
        p = Process( target = self._crawl, args = ( q, search ) )
        p.start()
        p.join()
        return q.get()
Example #32
def get_spiders():
    """returns a dict of spiders
    """
    settings = get_project_settings()
    crawler = CrawlerProcess(settings)
    crawler.settings = settings
    crawler.configure()

    spiders = {}
    for spname in crawler.spiders.list():
        spider = crawler.spiders.create(spname)
        module_name = spider.__module__
        if '_feedspider' not in module_name:
            match_obj = re.match(r'openrecipes\.spiders\.([a-zA-Z0-9]+)_spider',
                            module_name)
            if match_obj:
                short_name = match_obj.group(1)
                spiders[short_name] = spider

    return spiders
class RunCrawler():
    """RunCrawler runs a crawler in a separate process.

    Useful sources:
    https://groups.google.com/forum/?fromgroups#!topic/scrapy-users/8zL8W3SdQBo
    http://stackoverflow.com/questions/13437402/how-to-run-scrapy-from-within-a-python-script
    """
    def __init__(self, settings):
        self.crawler = CrawlerProcess(settings)
        self.crawler.configure()

    def _crawl(self, spider):
        self.crawler.crawl(spider)
        self.crawler.start()
        self.crawler.stop()

    def crawl(self, spider):
        p = Process(target=self._crawl, args=(spider,))
        p.start()
        p.join()
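A hypothetical use of the class above, assuming MySpider and get_project_settings are importable:

runner = RunCrawler(get_project_settings())
runner.crawl(MySpider())   # blocks until the child process finishes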
Example #34
class CrawlerScript:
    def __init__(self):
        self.crawler = CrawlerProcess(settings)
        if not hasattr(project, "crawler"):
            self.crawler.install()
        self.crawler.configure()
        self.items = []
        dispatcher.connect(self._item_passed, signals.item_passed)

    def _item_passed(self, item):
        self.items.append(item)

    def crawl(self, queue, spider_name):
        spider = self.crawler.spiders.create(spider_name)
        # if spider:
        #     self.crawler.queue.append_spider(spider)
        # self.crawler.start()
        # self.crawler.stop()
        # queue.put(self.items)
        return spider.crawl()
Example #35
class RunCrawler():
    """RunCrawler runs a crawler in a separate process.

    Useful sources:
    https://groups.google.com/forum/?fromgroups#!topic/scrapy-users/8zL8W3SdQBo
    http://stackoverflow.com/questions/13437402/how-to-run-scrapy-from-within-a-python-script
    """
    def __init__(self, settings):
        self.crawler = CrawlerProcess(settings)
        self.crawler.configure()

    def _crawl(self, spider):
        self.crawler.crawl(spider)
        self.crawler.start()
        self.crawler.stop()

    def crawl(self, spider):
        p = Process(target=self._crawl, args=(spider, ))
        p.start()
        p.join()
class CrawlerScript():

    def __init__(self, spider, results):
        self.results = results
        self.crawler = CrawlerProcess(settings)
        if not hasattr(project, 'crawler'):
            self.crawler.install()
        self.crawler.configure()
        self.items = []
        self.spider = spider
        dispatcher.connect(self._item_passed, signals.item_passed)

    def _item_passed(self, item):
        self.items.append(item)

    def run(self):
        self.crawler.crawl(self.spider)
        self.crawler.start()
        self.crawler.stop()
        self.results.put(self.items)
class DomainCrawlerScript():

    def __init__(self):
        self.crawler = CrawlerProcess(settings)
        self.crawler.install()
        self.crawler.configure()

    def _crawl(self, domain_pk):
        domain = Domain.objects.get(
            pk = domain_pk,
        )
        urls = []
        for page in domain.pages.all():
            urls.append(page.url())
        self.crawler.crawl(DomainSpider(urls))
        self.crawler.start()
        self.crawler.stop()

    def crawl(self, domain_pk):
        p = Process(target=self._crawl, args=[domain_pk])
        p.start()
        p.join()
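A hypothetical call from Django code, with 42 standing in for a real Domain primary key:

DomainCrawlerScript().crawl(42)   # scrapes every page URL stored for that Domain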
Example #38
def main(parser):
    """Setups item signal and run the spider"""
    # set up signal to catch items scraped
    from scrapy import signals
    from scrapy.xlib.pydispatch import dispatcher

    # no-op item handler; items are written out via the CSV feed configured below
    def catch_item(sender, item, **kwargs):
        pass
        
    dispatcher.connect(catch_item, signal=signals.item_passed)

    # shut off log
    from scrapy.conf import settings
    settings.overrides['LOG_ENABLED'] = False

    settings.overrides['FEED_URI'] = 'stdout.csv'
    settings.overrides['FEED_FORMAT'] = 'csv'
    # set up crawler
    from scrapy.crawler import CrawlerProcess

    crawler = CrawlerProcess(settings)
    crawler.install()
    crawler.configure()
 
    # parse_args() returns (options, positional args); with the names swapped
    # here, opts[0] is the start URL given on the command line
    args, opts = parser.parse_args()
    # schedule spider
    ext = tldextract.extract(opts[0])
    allowed_domain = '.'.join(ext[1:])
    spider = SeoSpider(opts[0], allowed_domain)
    #spider.set_url('http://www.newcustomerworkshop.com')
    
    crawler.crawl(spider)

    # start engine scrapy/twisted
    print "STARTING ENGINE"
    crawler.start()
    print "ENGINE STOPPED"
def _run_crawl_process(**kwargs):
    #log.start must be explicitly called
    log.start(loglevel=getattr(django_settings, 'SCRAPY_LOG_LEVEL', 'INFO'))

    # region How to run a crawler in-process
    # examples on how to get this stuff:
    # http://stackoverflow.com/questions/14777910/scrapy-crawl-from-script-always-blocks-script-execution-after-scraping?lq=1
    # http://stackoverflow.com/questions/13437402/how-to-run-scrapy-from-within-a-python-script
    # http://stackoverflow.com/questions/7993680/running-scrapy-tasks-in-python
    # http://stackoverflow.com/questions/15564844/locally-run-all-of-the-spiders-in-scrapy
    # https://groups.google.com/forum/#!topic/scrapy-users/d4axj6nPVDw
    # endregion

    crawler = CrawlerProcess(settings)
    crawler.install()
    crawler.configure()
    spider = crawler.spiders.create(kwargs['spider'], **kwargs)
    crawler.crawl(spider)

    log.msg('Spider started...')
    crawler.start()
    log.msg('Spider stopped.')
    crawler.stop()
Example #40
class CrawlerWorker(multiprocessing.Process):
    
    def __init__(self, spider, result_queue):
        multiprocessing.Process.__init__(self)
        self.result_queue = result_queue

        self.crawler = CrawlerProcess(settings)
        if not hasattr(project, 'crawler'):
            self.crawler.install()
        self.crawler.configure()

        self.items = []
        self.spider = spider
        dispatcher.connect(self._item_passed, signals.item_passed)
    
    def _item_passed(self, item):
        self.items.append(item)

    def run(self):
        self.crawler.crawl(self.spider)
        self.crawler.start()
        self.crawler.stop()
        self.result_queue.put(self.items)
def run_spider(spider, settings):
    """Run a spider with given settings"""
    from scrapy import signals
    from scrapy.xlib.pydispatch import dispatcher
    from scrapy.settings import CrawlerSettings
      
    def catch_item(sender, item, **kwargs):
        #log.msg("Got:" + str(item))
        pass
       
    dispatcher.connect(catch_item, signal=signals.item_passed)

    from scrapy.crawler import CrawlerProcess

    settings = CrawlerSettings(values=settings)

    crawler = CrawlerProcess(settings)
    crawler.install()
    crawler.configure()
    crawler.crawl(spider)

    #log.start(loglevel='DEBUG')

    crawler.start()
Example #42
def run_spider(spider, settings):
    """Run a spider with given settings"""
    from scrapy import signals
    from scrapy.xlib.pydispatch import dispatcher
    from scrapy.settings import CrawlerSettings
      
    def catch_item(sender, item, **kwargs):
        #log.msg("Got:" + str(item))
        pass
       
    dispatcher.connect(catch_item, signal=signals.item_passed)

    from scrapy.crawler import CrawlerProcess

    settings = CrawlerSettings(values=settings)

    crawler = CrawlerProcess(settings)
    crawler.install()
    crawler.configure()
    crawler.crawl(spider)

    #log.start(loglevel='DEBUG')

    crawler.start()
def _run_crawl_process(**kwargs):
  #log.start must be explicitly called
  log.start(loglevel=getattr(django_settings, 'SCRAPY_LOG_LEVEL', 'INFO'))

  # region How to run a crawler in-process
  # examples on how to get this stuff:
  # http://stackoverflow.com/questions/14777910/scrapy-crawl-from-script-always-blocks-script-execution-after-scraping?lq=1
  # http://stackoverflow.com/questions/13437402/how-to-run-scrapy-from-within-a-python-script
  # http://stackoverflow.com/questions/7993680/running-scrapy-tasks-in-python
  # http://stackoverflow.com/questions/15564844/locally-run-all-of-the-spiders-in-scrapy
  # https://groups.google.com/forum/#!topic/scrapy-users/d4axj6nPVDw
  # endregion

  crawler = CrawlerProcess(settings)
  crawler.install()
  crawler.configure()
  spider = crawler.spiders.create(kwargs['spider'], **kwargs)
  crawler.crawl(spider)


  log.msg('Spider started...')
  crawler.start()
  log.msg('Spider stopped.')
  crawler.stop()
Example #44
class ScraperTest(TestCase):

    SERVER_URL = 'http://*****:*****@href',
                                  from_detail_page=False)
        self.se_url.save()
        self.se_desc = ScraperElem(
            scraped_obj_attr=self.soa_desc,
            scraper=self.scraper,
            x_path=u'//div/div[@class="description"]/text()',
            from_detail_page=True,
            mandatory=False)
        self.se_desc.save()

        self.sched_rt = SchedulerRuntime()
        self.sched_rt.save()

        self.event_website = EventWebsite(
            pk=1,
            name=u'Event Website',
            scraper=self.scraper,
            url=os.path.join(self.SERVER_URL, 'site_generic/event_main.html'),
            scraper_runtime=self.sched_rt,
        )
        self.event_website.save()

        settings.overrides['ITEM_PIPELINES'] = [
            'dynamic_scraper.pipelines.DjangoImagesPipeline',
            'dynamic_scraper.pipelines.ValidationPipeline',
            'scraper.scraper_test.DjangoWriterPipeline',
        ]

        settings.overrides['IMAGES_STORE'] = os.path.join(
            self.PROJECT_ROOT, 'imgs')
        settings.overrides['IMAGES_THUMBS'] = {
            'small': (170, 170),
        }

        self.crawler = CrawlerProcess(settings)
        self.crawler.install()
        self.crawler.configure()

        for name, signal in vars(signals).items():
            if not name.startswith('_'):
                dispatcher.connect(self.record_signal, signal)

    def tearDown(self):
        pass
Example #45
# scrapy api
from scrapy import signals, log
from twisted.internet import reactor
from scrapy.crawler import CrawlerProcess
from scrapy.settings import Settings


def spider_closing(spider):
    """Activates on spider closed signal"""
    log.msg("Closing reactor", level=log.INFO)
    reactor.stop()


log.start(loglevel=log.DEBUG)
settings = Settings()

# crawl responsibly
settings.set(
    "USER_AGENT",
    "Mozilla/5.0 (Windows NT 6.2; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/32.0.1667.0 Safari/537.36"
)
crawler = CrawlerProcess(settings)

# stop reactor when spider closes
crawler.signals.connect(spider_closing, signal=signals.spider_closed)

crawler.configure()
crawler.crawl(spiders())
crawler.start()
reactor.run()