Code Example #1
class CrawlerScript():
    def __init__(self):
        self.crawler = CrawlerProcess(settings)
        if not hasattr(project, 'crawler'):
            self.crawler.install()
        self.crawler.configure()
        self.items = []
        dispatcher.connect(self._item_passed, signals.item_passed)

    def _item_passed(self, item):
        self.items.append(item)

    def _crawl(self, queue, spider_name):
        spider = self.crawler.spiders.create(spider_name)
        if spider:
            self.crawler.queue.append_spider(spider)
        self.crawler.start()
        self.crawler.stop()
        queue.put(self.items)

    def crawl(self, spider):
        queue = Queue()
        p = Process(target=self._crawl, args=(
            queue,
            spider,
        ))
        p.start()
        p.join()
        return queue.get(True)
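
A minimal driver sketch for the CrawlerScript class above, assuming the same legacy Scrapy 0.x API the snippet relies on (scrapy.conf.settings, scrapy.project, CrawlerProcess.install()); the spider name 'example_spider' is a placeholder for a spider registered in the project:

# Hypothetical standalone driver; assumes the CrawlerScript class above is in scope.
from multiprocessing import Process, Queue

from scrapy import project, signals
from scrapy.conf import settings
from scrapy.crawler import CrawlerProcess
from scrapy.xlib.pydispatch import dispatcher

if __name__ == '__main__':
    # Each crawl() call runs one crawl in a child process and returns the
    # scraped items, so the Twisted reactor never has to be restarted
    # inside the parent process.
    script = CrawlerScript()
    items = script.crawl('example_spider')  # placeholder spider name
    print "scraped %d items" % len(items)
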
Code Example #2
class CrawlerWorker(multiprocessing.Process):
 
    def __init__(self, spider, result_queue):
        multiprocessing.Process.__init__(self)
        self.result_queue = result_queue
 
        self.crawler = CrawlerProcess(settings)
        if not hasattr(project, 'crawler'):
            self.crawler.install()
        self.crawler.configure()
 
        self.items = []
        self.spider = spider
        dispatcher.connect(self._item_passed, signals.item_passed)
    #__init__
    
    def _item_passed(self, item):
        self.items.append(item)
    # _item_passed
    
    def run(self):
        self.crawler.crawl(self.spider)
        self.crawler.start()
        self.crawler.stop()
        self.result_queue.put(self.items)
    #run
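
A short usage sketch for the CrawlerWorker above, assuming the class is in scope and using the placeholder spider class SomeSpider; the parent starts the worker, reads the items off the shared queue, then joins:

# Hypothetical caller; CrawlerWorker is the class defined above and
# SomeSpider stands in for a real spider class from your project.
import multiprocessing

result_queue = multiprocessing.Queue()
worker = CrawlerWorker(SomeSpider(), result_queue)
worker.start()              # run() performs the crawl in the child process
items = result_queue.get()  # read results before join() to avoid blocking on a full pipe
worker.join()
print "collected %d items" % len(items)
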
Code Example #3
File: foxy.py Project: claudioharu/MngX
def create_crawler(spider):
    '''Setups item signal and run the spider'''
    from scrapy import signals
    from scrapy.xlib.pydispatch import dispatcher

    def catch_item(sender, item, **kwargs):
         print "Got:", item

    dispatcher.connect(catch_item, signal=signals.item_passed)

    # shut off log
    from scrapy.conf import settings
    settings.overrides['LOG_ENABLED'] = False

    # set up crawler
    from scrapy.crawler import CrawlerProcess

    crawler = CrawlerProcess(settings)
    crawler.install()
    crawler.configure()

    # schedule spider
    crawler.crawl(spider)

    return crawler
Code Example #4
File: casaLibro.py Project: flubbers/AZScraping
def scrapeando():
    from scrapy import signals
    from scrapy.xlib.pydispatch import dispatcher

    def catch_item(sender, item, **kwargs):
        """Populate the database"""
        for i in enumerate(item.items()):
            x = i[0]
            query = "INSERT INTO book (Nombre ,Autor, Editorial ,Fecha, Precio, Link) VALUES ("+decodifica(item['Nombre'][x])+","+decodifica(item['Autor'][x])+","+decodifica(item['Editorial'][x])+","+decodifica(item['Fecha'][x])+","+decodifica(item['Precio'][x])+","+decodifica("http://www.casadellibro.com"+item['Link'][x])+");"
            db.micursor.execute(query)
            db.conexion.commit()
        print item

    dispatcher.connect(catch_item, signal=signals.item_passed)

    from scrapy.conf import settings
    settings.overrides['LOG_ENABLED'] = False

    from scrapy.crawler import CrawlerProcess

    crawler = CrawlerProcess(settings)
    crawler.install()
    crawler.configure()
    book = BookSpider()
    book.busqueda=unicode(search.getbusqueda())
    crawler.crawl(book)
    print "Start scraping to la Casa del Libro"
    crawler.start()
    print "End scraping to la Casa del Libro"
    crawler.stop()
Code Example #5
File: cmdline.py Project: reenvs/self-summary
def execute(argv=None):
    if argv is None:
        argv = sys.argv
    crawler = CrawlerProcess(settings)
    crawler.install()
    inproject = inside_project()
    _check_deprecated_scrapy_ctl(argv, inproject) # TODO: remove for Scrapy 0.11
    cmds = _get_commands_dict(inproject)
    cmdname = _pop_command_name(argv)
    parser = optparse.OptionParser(formatter=optparse.TitledHelpFormatter(), \
        conflict_handler='resolve')
    if not cmdname:
        _print_commands(inproject)
        sys.exit(0)
    elif cmdname not in cmds:
        _print_unknown_command(cmdname, inproject)
        sys.exit(2)

    cmd = cmds[cmdname]
    parser.usage = "scrapy %s %s" % (cmdname, cmd.syntax())
    parser.description = cmd.long_desc()
    settings.defaults.update(cmd.default_settings)
    cmd.settings = settings
    cmd.add_options(parser)
    opts, args = parser.parse_args(args=argv[1:])
    _run_print_help(parser, cmd.process_options, args, opts)
    cmd.set_crawler(crawler)
    _run_print_help(parser, _run_command, cmd, args, opts)
    sys.exit(cmd.exitcode)
Code Example #6
def main():
    """Setups item signal and run the spider"""
    # set up signal to catch items scraped
    from scrapy import signals
    from scrapy.xlib.pydispatch import dispatcher
    
    
    def catch_item(sender, item, **kwargs):
        print "Got:", item

    # shut off log
    from scrapy.conf import settings
    settings.overrides['LOG_ENABLED'] = False

    # set up crawler
    from scrapy.crawler import CrawlerProcess

    crawler = CrawlerProcess(settings)
    crawler.install()
    crawler.configure()

    # schedule spider
    crawler.crawl(MySpider())

    # start engine scrapy/twisted
    print "STARTING ENGINE"
    crawler.start()
    print "ENGINE STOPPED"
Code Example #7
def test_cp():
    crawlerProcess = CrawlerProcess(scrapy_conf)
    crawlerProcess.install()
    crawlerProcess.configure()

    crawlerProcess.queue.append_spider(myspider)
    crawlerProcess.start()
Code Example #8
def run_spider(spider, settings):
    """Run a spider with given settings"""
    from scrapy import signals
    from scrapy.xlib.pydispatch import dispatcher
    from scrapy.settings import CrawlerSettings

    def catch_item(sender, item, **kwargs):
        #log.msg("Got:" + str(item))
        pass

    dispatcher.connect(catch_item, signal=signals.item_passed)
    """clean storage"""
    scraperwiki.sqlite.execute("drop table if exists " + spider.name)
    scraperwiki.sqlite.commit()

    from scrapy.crawler import CrawlerProcess

    settings = CrawlerSettings(values=settings)

    crawler = CrawlerProcess(settings)
    crawler.install()
    crawler.configure()
    crawler.crawl(spider)

    #log.start(loglevel='DEBUG')

    crawler.start()
Code Example #9
def main():
    """Setups item signal and run the spider"""
    # set up signal to catch items scraped
    from scrapy import signals
    from scrapy.xlib.pydispatch import dispatcher

    def catch_item(sender, item, **kwargs):
        print "Got:", item

    # shut off log
    from scrapy.conf import settings
    settings.overrides['LOG_ENABLED'] = False

    # set up crawler
    from scrapy.crawler import CrawlerProcess

    crawler = CrawlerProcess(settings)
    crawler.install()
    crawler.configure()

    # schedule spider
    crawler.crawl(MySpider())

    # start engine scrapy/twisted
    print "STARTING ENGINE"
    crawler.start()
    print "ENGINE STOPPED"
Code Example #10
 def handle(self, *args, **options):
     from scrapy import signals
     from scrapy.xlib.pydispatch import dispatcher
     
     def catch_item(sender, item, **kwargs):
         print "Got:", item
         
     dispatcher.connect(catch_item, signal=signals.item_passed)
     
     from scrapy.conf import settings
     settings.overrides['LOG_ENABLED'] = True
     
     from scrapy.crawler import CrawlerProcess
     
     crawler = CrawlerProcess(settings)
     crawler.install()
     crawler.configure()
     
     from alescspider.spiders import *
     spiders = [deputado_spider.DeputadoSpider()]
     #spiders = [presenca_spider.PresencaSpider(), votos_spider.VotosSpider(), deputado_spider.DeputadoSpider()]
     for spider in spiders:
         crawler.queue.append_spider(spider)
     
     print "STARTING ENGINE"
     crawler.start()
     print "ENGINE STOPPED"
     
Code Example #11
File: crawlerBlog.py Project: catsecorg/Scripts
def main():
    """Main routine for running the Spider"""
    # set up signal to catch items scraped
    from scrapy import signals
    from scrapy.xlib.pydispatch import dispatcher

    def catch_item(sender, item, **kwargs):
        print "Item Extraido:", item

    dispatcher.connect(catch_item, signal=signals.item_passed)

    from scrapy.conf import settings
    settings.overrides['LOG_ENABLED'] = False

    # setup crawler
    from scrapy.crawler import CrawlerProcess

    crawler = CrawlerProcess(settings)
    crawler.install()
    crawler.configure()

    # define the spider for the crawler
    crawler.crawl(BloggerSpider())

    # start scrapy
    print "STARTING ENGINE"
    crawler.start()
    print "ENGINE STOPPED"
Code Example #12
File: test_tw3.py Project: Big-Data/ec2
def test_cp():
    crawlerProcess = CrawlerProcess(scrapy_conf)
    crawlerProcess.install()
    crawlerProcess.configure()    

    crawlerProcess.queue.append_spider(myspider)
    crawlerProcess.start()
Code Example #13
File: cmdline.py Project: Root-nix/scrapy
def execute(argv=None):
    if argv is None:
        argv = sys.argv
    crawler = CrawlerProcess(settings)
    crawler.install()
    inproject = inside_project()
    cmds = _get_commands_dict(inproject)
    cmdname = _pop_command_name(argv)
    parser = optparse.OptionParser(formatter=optparse.TitledHelpFormatter(), \
        conflict_handler='resolve')
    if not cmdname:
        _print_commands(inproject)
        sys.exit(0)
    elif cmdname not in cmds:
        _print_unknown_command(cmdname, inproject)
        sys.exit(2)

    cmd = cmds[cmdname]
    parser.usage = "scrapy %s %s" % (cmdname, cmd.syntax())
    parser.description = cmd.long_desc()
    settings.defaults.update(cmd.default_settings)
    cmd.settings = settings
    cmd.add_options(parser)
    opts, args = parser.parse_args(args=argv[1:])
    _run_print_help(parser, cmd.process_options, args, opts)
    cmd.set_crawler(crawler)
    _run_print_help(parser, _run_command, cmd, args, opts)
    sys.exit(cmd.exitcode)
Code Example #14
class CrawlerScript():

    def __init__(self):
        self.crawler = CrawlerProcess(settings)
        if not hasattr(project, 'crawler'):
            self.crawler.install()
        self.crawler.configure()
        self.items = []
        dispatcher.connect(self._item_passed, signals.item_passed)

    def _item_passed(self, item):
        self.items.append(item)

    def _crawl(self, queue, spider_name):
        spider = self.crawler.spiders.create(spider_name)
        if spider:
            self.crawler.queue.append_spider(spider)
        self.crawler.start()
        self.crawler.stop()
        queue.put(self.items)

    def crawl(self, spider):
        queue = Queue()
        p = Process(target=self._crawl, args=(queue, spider,))
        p.start()
        p.join()
        return queue.get(True)
Code Example #15
File: crawlerBlog.py Project: Adastra-thw/pyHacks
def main():
	"""Main routine for running the Spider"""
	# set up signal to catch items scraped
	from scrapy import signals
	from scrapy.xlib.pydispatch import dispatcher

	def catch_item(sender, item, **kwargs):
		print "Item Extraido:", item
	dispatcher.connect(catch_item, signal=signals.item_passed)

	from scrapy.conf import settings
	settings.overrides['LOG_ENABLED'] = False

	# setup crawler
	from scrapy.crawler import CrawlerProcess

	crawler = CrawlerProcess(settings)
	crawler.install()
	crawler.configure()

	# define the spider for the crawler
	crawler.crawl(BloggerSpider())

	# start scrapy
	print "STARTING ENGINE"
	crawler.start()
	print "ENGINE STOPPED"
Code Example #16
class Worker(multiprocessing.Process):
    def __init__(self, spider, deckbox_user, deckbox_pass):
        multiprocessing.Process.__init__(self)

        self.crawler = CrawlerProcess(settings)
        if not hasattr(project, 'crawler'):
            self.crawler.install()
        self.crawler.configure()

        self.items = []
        self.spider = spider
        self.username = deckbox_user
        self.password = deckbox_pass
        dispatcher.connect(self._item_passed, signals.item_passed)

    def _item_passed(self, item):
        self.items.append(item)

    def _crawl(self):
        if self.spider:
            self.crawler.crawl(self.spider(self.username, self.password))
        self.crawler.start()
        self.crawler.stop()

    def crawl(self, spider):
        p = Process(target=self._crawl)
        p.start()
        p.join()
Code Example #17
def main():
    """Setups item signal and run the spider"""
    # set up signal to catch items scraped
    from scrapy import signals
    from scrapy.xlib.pydispatch import dispatcher

    def catch_item(sender, item, **kwargs):
        print "Got:", item

    options = parse_args()

    dispatcher.connect(catch_item, signal=signals.item_passed)

    # shut off log
    from scrapy.conf import settings
    settings.overrides['LOG_ENABLED'] = True
    settings.overrides['DEPTH_LIMIT'] = 2


    # set up crawler
    from scrapy.crawler import CrawlerProcess

    crawler = CrawlerProcess(settings)
    crawler.install()
    crawler.configure()

    # schedule spider
    spider = MySpider(input=options.input, output=options.output)
    crawler.queue.append_spider(spider)

    # start engine scrapy/twisted
    print "STARTING ENGINE"
    crawler.start()
    print "ENGINE STOPPED"
Code Example #18
def run_spider(spider, settings):
    """Run a spider with given settings"""
    from scrapy import signals
    from scrapy.xlib.pydispatch import dispatcher
    from scrapy.settings import CrawlerSettings
      
    def catch_item(sender, item, **kwargs):
        #log.msg("Got:" + str(item))
        pass
       
    dispatcher.connect(catch_item, signal=signals.item_passed)

    """clean storage"""
    scraperwiki.sqlite.execute("drop table if exists "+spider.name)
    scraperwiki.sqlite.commit()


    from scrapy.crawler import CrawlerProcess

    settings = CrawlerSettings(values=settings)

    crawler = CrawlerProcess(settings)
    crawler.install()
    crawler.configure()
    crawler.crawl(spider)

    #log.start(loglevel='DEBUG')

    crawler.start()
Code Example #19
def run_spider(spider, settings):
    from scrapy.crawler import CrawlerProcess
    crawler = CrawlerProcess(settings)
    crawler.install()
    crawler.configure()
    crawler.crawl(spider)
    log.start()
    crawler.start()
Code Example #20
def run_spider(spider, settings):
    """Run a spider with given settings"""
    from scrapy.crawler import CrawlerProcess
    crawler = CrawlerProcess(settings)
    crawler.install()
    crawler.configure()
    crawler.crawl(spider)
    log.start()
    crawler.start()
Code Example #21
def run_tide_scrapy(stationID, startDate, endDate, **kwargs):
    settings.overrides.update({})  # your settings
    crawlerProcess = CrawlerProcess(settings)
    crawlerProcess.install()
    crawlerProcess.configure()
    spider = TideSpider(stationID, startDate, endDate)
    crawlerProcess.crawl(spider)
    try:
        crawlerProcess.start()
    except:
        print "error"
Code Example #22
File: runscrapy.py Project: mvrk/BACKUP
def run_tide_scrapy(stationID, startDate, endDate, **kwargs):
    settings.overrides.update({}) # your settings
    crawlerProcess = CrawlerProcess(settings)
    crawlerProcess.install()
    crawlerProcess.configure()
    spider = TideSpider(stationID, startDate, endDate)
    crawlerProcess.crawl(spider)
    try:
        crawlerProcess.start()
    except:
        print "error"
Code Example #23
def run_spider(spider, settings=None):
    """Run a spider instance through the scrapy crawler.

    This function is suitable for standalone scripts.
    """
    crawler = CrawlerProcess(_build_settings(settings))
    crawler.install()
    crawler.configure()
    log.start_from_crawler(crawler)
    crawler.crawl(spider)
    crawler.start()
Code Example #24
def run_spider(spider, settings):
    from scrapy.crawler import CrawlerProcess
    crawler = CrawlerProcess(settings)
    crawler.install()
    crawler.configure()

    # schedule spider
    crawler.crawl(spider)
    
    log.start()
    # start engine scrapy/twisted
    crawler.start()
Code Example #25
File: run_wind.py Project: mvrk/TEAKWOOD
def run_wind_spider(startDate, endDate, **kwargs):
    wind_crawlerProcess = CrawlerProcess(settings)
    wind_crawlerProcess.install()
    wind_crawlerProcess.configure()
    spider2 = WindSpider("dpia1", sys.argv[1], sys.argv[2])

    wind_crawlerProcess.crawl(spider2)
    try:
        wind_crawlerProcess.start()
        wind_crawlerProcess.stop()
        wind_crawlerProcess.uninstall()
    except Exception as e:
        print e
Code Example #26
def run_river_spider(startDate, endDate, **kwargs):
    water_crawlerProcess = CrawlerProcess(settings)
    water_crawlerProcess.install()
    water_crawlerProcess.configure()

    spider = RiverSpider(sys.argv[1], sys.argv[2])
    water_crawlerProcess.crawl(spider)
    try:
        water_crawlerProcess.start()
        water_crawlerProcess.stop()
        water_crawlerProcess.uninstall()
    except Exception as e:
        print e
Code Example #27
def runscrapy(stationID, startDate, endDate, **kwargs):
    crawlerProcess = CrawlerProcess(settings)
    crawlerProcess.install()
    crawlerProcess.configure()

    spider = Spider(stationID, startDate, endDate)
    crawlerProcess.crawl(spider)
    try:
        crawlerProcess.start()
        crawlerProcess.stop()
        crawlerProcess.uninstall()
    except Exception as e:
        print e
Code Example #28
File: runscrapy.py Project: mvrk/BACKUP
def runscrapy(stationID, startDate, endDate, **kwargs):
    crawlerProcess = CrawlerProcess(settings)
    crawlerProcess.install()
    crawlerProcess.configure()

    spider = Spider(stationID, startDate, endDate)
    crawlerProcess.crawl(spider)
    try:
        crawlerProcess.start()
        crawlerProcess.stop()
        crawlerProcess.uninstall()
    except Exception as e:
        print e
Code Example #29
File: run_wind.py Project: mvrk/BACKUP
def run_wind_spider(startDate, endDate, **kwargs):
    wind_crawlerProcess = CrawlerProcess(settings)
    wind_crawlerProcess.install()
    wind_crawlerProcess.configure()
    spider2 = WindSpider("dpia1", sys.argv[1], sys.argv[2])

    wind_crawlerProcess.crawl(spider2)
    try:
        wind_crawlerProcess.start()
        wind_crawlerProcess.stop()
        wind_crawlerProcess.uninstall()
    except Exception as e:
        print e
Code Example #30
File: test_crawler.py Project: pferdwurst/parkyou
    def setUp(self):
        crawler = CrawlerProcess(settings)
        crawler.install()
        # what does this do?
        inside_project()
        self.items = []

        self.crawl_cmd = scrapy.commands.crawl.Command() 
        self.crawl_cmd.set_crawler(crawler)

        self.parser = optparse.OptionParser()
        self.crawl_cmd.add_options(self.parser)
        dispatcher.connect(self._item_passed, signals.item_passed)
Code Example #31
File: run_water.py Project: mvrk/BACKUP
def run_water_spider(startDate, endDate, **kwargs):
    water_crawlerProcess = CrawlerProcess(settings)
    water_crawlerProcess.install()
    water_crawlerProcess.configure()
    
    spider = WaterSpider("8735180", sys.argv[1], sys.argv[2])
    water_crawlerProcess.crawl(spider)
    try:
    	water_crawlerProcess.start()
    	water_crawlerProcess.stop()
    	water_crawlerProcess.uninstall()
    except Exception as e:
    	print e
Code Example #32
File: cmdline.py Project: ZhaiQiliang/scrapy
def execute(argv=None, settings=None):
    if argv is None:
        argv = sys.argv

    # --- backwards compatibility for scrapy.conf.settings singleton ---
    if settings is None and "scrapy.conf" in sys.modules:
        from scrapy import conf

        if hasattr(conf, "settings"):
            settings = conf.settings
    # ------------------------------------------------------------------

    if settings is None:
        settings = get_project_settings()
    check_deprecated_settings(settings)

    # --- backwards compatibility for scrapy.conf.settings singleton ---
    import warnings
    from scrapy.exceptions import ScrapyDeprecationWarning

    with warnings.catch_warnings():
        warnings.simplefilter("ignore", ScrapyDeprecationWarning)
        from scrapy import conf

        conf.settings = settings
    # ------------------------------------------------------------------

    crawler = CrawlerProcess(settings)
    crawler.install()
    inproject = inside_project()
    cmds = _get_commands_dict(settings, inproject)
    cmdname = _pop_command_name(argv)
    parser = optparse.OptionParser(formatter=optparse.TitledHelpFormatter(), conflict_handler="resolve")
    if not cmdname:
        _print_commands(settings, inproject)
        sys.exit(0)
    elif cmdname not in cmds:
        _print_unknown_command(settings, cmdname, inproject)
        sys.exit(2)

    cmd = cmds[cmdname]
    parser.usage = "scrapy %s %s" % (cmdname, cmd.syntax())
    parser.description = cmd.long_desc()
    settings.defaults.update(cmd.default_settings)
    cmd.settings = settings
    cmd.add_options(parser)
    opts, args = parser.parse_args(args=argv[1:])
    _run_print_help(parser, cmd.process_options, args, opts)
    cmd.set_crawler(crawler)
    _run_print_help(parser, _run_command, cmd, args, opts)
    sys.exit(cmd.exitcode)
Code Example #33
def run_crawler(argv=None, settings=None):
    """Run the scrapy crawler bounded to registered spiders.

    This function is suitable for standalone scripts.

    Usage::

        # mimic 'scrapy crawl' command having these two spiders available
        SpiderManager.register(FooSpider)
        SpiderManager.register(BarSpider)

        run_crawler()

    """
    argv = argv or sys.argv
    settings = _build_settings(settings)

    # load spider manager from this module
    settings.overrides.update({
        'SPIDER_MANAGER_CLASS':
        '%s.%s' % (__name__, SpiderManager.__name__),
    })

    crawler = CrawlerProcess(settings)
    crawler.install()

    parser = optparse.OptionParser(formatter=optparse.TitledHelpFormatter())
    parser.add_option('-l',
                      '--list',
                      action='store_true',
                      help="List available spiders")

    cmd = CrawlCommand()
    settings.defaults.update(cmd.default_settings)
    cmd.settings = settings
    cmd.add_options(parser)

    parser.usage = "%s %s" % (argv[0], cmd.syntax())
    opts, args = parser.parse_args()
    if opts.list:
        settings.defaults.update(ListCommand.default_settings)
        listcmd = ListCommand()
        listcmd.set_crawler(crawler)
        listcmd.run(args, opts)
        sys.exit(listcmd.exitcode)
    else:
        cmdline._run_print_help(parser, cmd.process_options, args, opts)
        cmd.set_crawler(crawler)
        cmdline._run_print_help(parser, cmdline._run_command, cmd, args, opts)
        sys.exit(cmd.exitcode)
Code Example #34
File: cmdline.py Project: weisbeck/403Section
def execute(argv=None, settings=None):
    if argv is None:
        argv = sys.argv

    # --- backwards compatibility for scrapy.conf.settings singleton ---
    if settings is None and 'scrapy.conf' in sys.modules:
        from scrapy import conf
        if hasattr(conf, 'settings'):
            settings = conf.settings
    # ------------------------------------------------------------------

    if settings is None:
        settings = get_project_settings()
    check_deprecated_settings(settings)

    # --- backwards compatibility for scrapy.conf.settings singleton ---
    import warnings
    from scrapy.exceptions import ScrapyDeprecationWarning
    with warnings.catch_warnings():
        warnings.simplefilter("ignore", ScrapyDeprecationWarning)
        from scrapy import conf
        conf.settings = settings
    # ------------------------------------------------------------------

    crawler = CrawlerProcess(settings)
    crawler.install()
    inproject = inside_project()
    cmds = _get_commands_dict(settings, inproject)
    cmdname = _pop_command_name(argv)
    parser = optparse.OptionParser(formatter=optparse.TitledHelpFormatter(), \
        conflict_handler='resolve')
    if not cmdname:
        _print_commands(settings, inproject)
        sys.exit(0)
    elif cmdname not in cmds:
        _print_unknown_command(settings, cmdname, inproject)
        sys.exit(2)

    cmd = cmds[cmdname]
    parser.usage = "scrapy %s %s" % (cmdname, cmd.syntax())
    parser.description = cmd.long_desc()
    settings.defaults.update(cmd.default_settings)
    cmd.settings = settings
    cmd.add_options(parser)
    opts, args = parser.parse_args(args=argv[1:])
    _run_print_help(parser, cmd.process_options, args, opts)
    cmd.set_crawler(crawler)
    _run_print_help(parser, _run_command, cmd, args, opts)
    sys.exit(cmd.exitcode)
Code Example #35
def main_spider():
    """Setups item signal and run the spider"""
    # set up signal to catch items scraped
    from scrapy import signals
    from scrapy.xlib.pydispatch import dispatcher

    def catch_item(sender, item, **kwargs):
        print "Got:", item

    dispatcher.connect(catch_item, signal=signals.item_passed)

    # setting
    setting()

    # set log
    start(logfile='log/spider/spider.log', loglevel='INFO', logstdout=False)
    # set up crawler
    from scrapy.crawler import CrawlerProcess

    crawler = CrawlerProcess(settings)
    crawler.install()
    crawler.configure()

    # schedule spider
    crawler.crawl(coreSpider())

    # start engine scrapy/twisted
    print "STARTING ENGINE"
    crawler.start()
    print '********************'
    c = coreSpider()
    getinfo = c.get_getinfo()
    print getinfo
    urls = c.get_urls()
    forms = c.get_forms(urls)
    print forms
    print len(c.get_urls())
    print "ENGINE STOPPED"
    # scanner
    h = HTTP()
    a = Attack_XSS(h)
    tmp = a.attack(getinfo, forms)
    print '%%%%%%%%%%%%%%%'

    print 'per XSS start'

    p_xss = Attack_permanentXSS(h)
    p_xss.attack_p(getinfo, forms, tmp)
Code Example #36
File: coreSpider.py Project: lymneu/UnitScan
def main_spider():
    """Setups item signal and run the spider"""
    # set up signal to catch items scraped
    from scrapy import signals
    from scrapy.xlib.pydispatch import dispatcher

    def catch_item(sender, item, **kwargs):
        print "Got:", item

    dispatcher.connect(catch_item, signal=signals.item_passed)

    # setting
    setting()
    
    # set log
    start(logfile = 'log/spider/spider.log',loglevel = 'INFO',logstdout = False)
    # set up crawler
    from scrapy.crawler import CrawlerProcess

    crawler = CrawlerProcess(settings)
    crawler.install()
    crawler.configure()

    # schedule spider
    crawler.crawl(coreSpider())

    # start engine scrapy/twisted
    print "STARTING ENGINE"
    crawler.start()
    print '********************'
    c = coreSpider()
    getinfo = c.get_getinfo()
    print getinfo
    urls = c.get_urls()
    forms = c.get_forms(urls) 
    print forms
    print len(c.get_urls())
    print "ENGINE STOPPED"
    # scanner
    h = HTTP()
    a = Attack_XSS(h)
    tmp = a.attack(getinfo,forms)
    print '%%%%%%%%%%%%%%%'
    
    print 'per XSS start'
    
    p_xss = Attack_permanentXSS(h)
    p_xss.attack_p(getinfo, forms, tmp)
Code Example #37
File: helpers.py Project: pombredanne/scrapyrwiki
def run_spider(spider, settings, loglevel='INFO'):
    """
    Run a spider with given settings
    """
    if 'SENTRY_DSN' in os.environ:
        import scrapy_sentry
        scrapy_sentry.init(os.environ['SENTRY_DSN'])
        settings.overrides.update({
            'SENTRY_SIGNALS': ['spider_error']
        })
    crawler = CrawlerProcess(settings)
    crawler.install()
    crawler.configure()
    crawler.crawl(spider)
    log.start(loglevel=loglevel)
    crawler.start()
Code Example #38
def run_crawler(argv=None, settings=None):
    """Run the scrapy crawler bounded to registered spiders.

    This function is suitable for standalone scripts.

    Usage::

        # mimic 'scrapy crawl' command having these two spiders available
        SpiderManager.register(FooSpider)
        SpiderManager.register(BarSpider)

        run_crawler()

    """
    argv = argv or sys.argv
    settings = _build_settings(settings)

    # load spider manager from this module
    settings.overrides.update({
        'SPIDER_MANAGER_CLASS': '%s.%s' % (__name__, SpiderManager.__name__),
    })

    crawler = CrawlerProcess(settings)
    crawler.install()

    parser = optparse.OptionParser(formatter=optparse.TitledHelpFormatter())
    parser.add_option('-l', '--list', action='store_true',
                      help="List available spiders")

    cmd = CrawlCommand()
    settings.defaults.update(cmd.default_settings)
    cmd.settings = settings
    cmd.add_options(parser)

    parser.usage = "%s %s" % (argv[0], cmd.syntax())
    opts, args = parser.parse_args()
    if opts.list:
        settings.defaults.update(ListCommand.default_settings)
        listcmd = ListCommand()
        listcmd.set_crawler(crawler)
        listcmd.run(args, opts)
        sys.exit(listcmd.exitcode)
    else:
        cmdline._run_print_help(parser, cmd.process_options, args, opts)
        cmd.set_crawler(crawler)
        cmdline._run_print_help(parser, cmdline._run_command, cmd, args, opts)
        sys.exit(cmd.exitcode)
Code Example #39
def run_tests(spider, output_file, settings):
    """
    Helper for running test contractors for a spider and output an
    XUnit file (for CI)

    For using offline input the HTTP cache is enabled
    """

    settings.overrides.update({
        "HTTPCACHE_ENABLED": True,
        "HTTPCACHE_EXPIRATION_SECS": 0,
    })

    crawler = CrawlerProcess(settings)

    contracts = build_component_list(
        crawler.settings['SPIDER_CONTRACTS_BASE'],
        crawler.settings['SPIDER_CONTRACTS'],
    )

    xunit = Xunit()
    xunit.enabled = True
    xunit.configure(AttributeDict(xunit_file=output_file), Config())
    xunit.stopTest = lambda *x: None

    check = CheckCommand()
    check.set_crawler(crawler)
    check.settings = settings
    check.conman = ContractsManager([load_object(c) for c in contracts])
    check.results = xunit
    # this are specially crafted requests that run tests as callbacks
    requests = check.get_requests(spider)

    crawler.install()
    crawler.configure()
    crawler.crawl(spider, requests)
    log.start(loglevel='DEBUG')

    # report is called when the crawler finishes, it creates the XUnit file
    report = lambda: check.results.report(check.results.error_report_file)
    dispatcher.connect(report, signals.engine_stopped)

    crawler.start()
Code Example #40
File: craigs_view.py Project: wennho/bezar
class CrawlerScript():
    def __init__( self ):
        self.crawler = CrawlerProcess( Settings() )
        self.crawler.install()
        self.crawler.configure()
    def _crawl( self, queue, search ):
        log.start( loglevel = log.DEBUG )
        current_spider = CraigslistSpider()
        if search:
            current_spider.set_search_url( search )
        self.crawler.crawl( current_spider )
        self.crawler.start()
        self.crawler.stop()
        queue.put( current_spider.get_object_list() )
    def crawl( self, search = "" ):
        q = Queue()
        p = Process( target = self._crawl, args = ( q, search ) )
        p.start()
        p.join()
        return q.get()
Code Example #41
File: gettheros.py Project: dekoza/mtgpl
class CrawlerScript:
    def __init__(self):
        self.crawler = CrawlerProcess(settings)
        if not hasattr(project, "crawler"):
            self.crawler.install()
        self.crawler.configure()
        self.items = []
        dispatcher.connect(self._item_passed, signals.item_passed)

    def _item_passed(self, item):
        self.items.append(item)

    def crawl(self, queue, spider_name):
        spider = self.crawler.spiders.create(spider_name)
        # if spider:
        #     self.crawler.queue.append_spider(spider)
        # self.crawler.start()
        # self.crawler.stop()
        # queue.put(self.items)
        return spider.crawl()
Code Example #42
class CrawlerScript():

    def __init__(self, spider, results):
        self.results = results
        self.crawler = CrawlerProcess(settings)
        if not hasattr(project, 'crawler'):
            self.crawler.install()
        self.crawler.configure()
        self.items = []
        self.spider = spider
        dispatcher.connect(self._item_passed, signals.item_passed)

    def _item_passed(self, item):
        self.items.append(item)

    def run(self):
        self.crawler.crawl(self.spider)
        self.crawler.start()
        self.crawler.stop()
        self.results.put(self.items)
Code Example #43
class DomainCrawlerScript():

    def __init__(self):
        self.crawler = CrawlerProcess(settings)
        self.crawler.install()
        self.crawler.configure()

    def _crawl(self, domain_pk):
        domain = Domain.objects.get(
            pk = domain_pk,
        )
        urls = []
        for page in domain.pages.all():
            urls.append(page.url())
        self.crawler.crawl(DomainSpider(urls))
        self.crawler.start()
        self.crawler.stop()

    def crawl(self, domain_pk):
        p = Process(target=self._crawl, args=[domain_pk])
        p.start()
        p.join()
Code Example #44
File: seocrawler.py Project: JoeCotellese/seocrawler
def main(parser):
    """Setups item signal and run the spider"""
    # set up signal to catch items scraped
    from scrapy import signals
    from scrapy.xlib.pydispatch import dispatcher

    #shut off logging to the console
    def catch_item(sender, item, **kwargs):
        pass
        
    dispatcher.connect(catch_item, signal=signals.item_passed)

    # shut off log
    from scrapy.conf import settings
    settings.overrides['LOG_ENABLED'] = False

    settings.overrides['FEED_URI'] = 'stdout.csv'
    settings.overrides['FEED_FORMAT'] = 'csv'
    # set up crawler
    from scrapy.crawler import CrawlerProcess

    crawler = CrawlerProcess(settings)
    crawler.install()
    crawler.configure()
 
    args,opts = parser.parse_args()
    # schedule spider
    ext = tldextract.extract(opts[0])
    allowed_domain =  '.'.join(ext[1:])
    spider = SeoSpider(opts[0],allowed_domain)
    #spider.set_url('http://www.newcustomerworkshop.com')
    
    crawler.crawl(spider)

    # start engine scrapy/twisted
    print "STARTING ENGINE"
    crawler.start()
    print "ENGINE STOPPED"
Code Example #45
File: scheduler.py Project: HackerEcology/qrator
class CrawlerWorker(multiprocessing.Process):
    
    def __init__(self, spider, result_queue):
        multiprocessing.Process.__init__(self)
        self.result_queue = result_queue

        self.crawler = CrawlerProcess(settings)
        if not hasattr(project, 'crawler'):
            self.crawler.install()
        self.crawler.configure()

        self.items = []
        self.spider = spider
        dispatcher.connect(self._item_passed, signals.item_passed)
    
    def _item_passed(self, item):
        self.items.append(item)

    def run(self):
        self.crawler.crawl(self.spider)
        self.crawler.start()
        self.crawler.stop()
        self.result_queue.put(self.items)
Code Example #46
def _run_crawl_process(**kwargs):
    #log.start must be explicitly called
    log.start(loglevel=getattr(django_settings, 'SCRAPY_LOG_LEVEL', 'INFO'))

    # region How to run a crawler in-process
    # examples on how to get this stuff:
    # http://stackoverflow.com/questions/14777910/scrapy-crawl-from-script-always-blocks-script-execution-after-scraping?lq=1
    # http://stackoverflow.com/questions/13437402/how-to-run-scrapy-from-within-a-python-script
    # http://stackoverflow.com/questions/7993680/running-scrapy-tasks-in-python
    # http://stackoverflow.com/questions/15564844/locally-run-all-of-the-spiders-in-scrapy
    # https://groups.google.com/forum/#!topic/scrapy-users/d4axj6nPVDw
    # endregion

    crawler = CrawlerProcess(settings)
    crawler.install()
    crawler.configure()
    spider = crawler.spiders.create(kwargs['spider'], **kwargs)
    crawler.crawl(spider)

    log.msg('Spider started...')
    crawler.start()
    log.msg('Spider stopped.')
    crawler.stop()
Code Example #47
def run_spider(spider, settings):
    """Run a spider with given settings"""
    from scrapy import signals
    from scrapy.xlib.pydispatch import dispatcher
    from scrapy.settings import CrawlerSettings
      
    def catch_item(sender, item, **kwargs):
        #log.msg("Got:" + str(item))
        pass
       
    dispatcher.connect(catch_item, signal=signals.item_passed)

    from scrapy.crawler import CrawlerProcess

    settings = CrawlerSettings(values=settings)

    crawler = CrawlerProcess(settings)
    crawler.install()
    crawler.configure()
    crawler.crawl(spider)

    #log.start(loglevel='DEBUG')

    crawler.start()
Code Example #48
File: scraper_test.py Project: muyiyangyang/DDS
class ScraperTest(TestCase):

    SERVER_URL = 'http://*****:*****@href',
                                  from_detail_page=False)
        self.se_url.save()
        self.se_desc = ScraperElem(
            scraped_obj_attr=self.soa_desc,
            scraper=self.scraper,
            x_path=u'//div/div[@class="description"]/text()',
            from_detail_page=True,
            mandatory=False)
        self.se_desc.save()

        self.sched_rt = SchedulerRuntime()
        self.sched_rt.save()

        self.event_website = EventWebsite(
            pk=1,
            name=u'Event Website',
            scraper=self.scraper,
            url=os.path.join(self.SERVER_URL, 'site_generic/event_main.html'),
            scraper_runtime=self.sched_rt,
        )
        self.event_website.save()

        settings.overrides['ITEM_PIPELINES'] = [
            'dynamic_scraper.pipelines.DjangoImagesPipeline',
            'dynamic_scraper.pipelines.ValidationPipeline',
            'scraper.scraper_test.DjangoWriterPipeline',
        ]

        settings.overrides['IMAGES_STORE'] = os.path.join(
            self.PROJECT_ROOT, 'imgs')
        settings.overrides['IMAGES_THUMBS'] = {
            'small': (170, 170),
        }

        self.crawler = CrawlerProcess(settings)
        self.crawler.install()
        self.crawler.configure()

        for name, signal in vars(signals).items():
            if not name.startswith('_'):
                dispatcher.connect(self.record_signal, signal)

    def tearDown(self):
        pass