Code example #1
File: foxy.py Project: claudioharu/MngX
def create_crawler(spider):
    '''Set up the item signal and run the spider'''
    from scrapy import signals
    from scrapy.xlib.pydispatch import dispatcher

    def catch_item(sender, item, **kwargs):
         print "Got:", item

    dispatcher.connect(catch_item, signal=signals.item_passed)

    # shut off log
    from scrapy.conf import settings
    settings.overrides['LOG_ENABLED'] = False

    # set up crawler
    from scrapy.crawler import CrawlerProcess

    crawler = CrawlerProcess(settings)
    crawler.install()
    crawler.configure()

    # schedule spider
    crawler.crawl(spider)

    return crawler
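The example above relies on pre-1.0 Scrapy APIs (scrapy.conf.settings, settings.overrides, crawler.install() and scrapy.xlib.pydispatch) that have since been removed. A minimal sketch of the same catch-the-items pattern on a recent Scrapy (assumed 1.8 or newer); QuotesSpider and its start URL are placeholders, not part of the original example:

import scrapy
from scrapy import signals
from scrapy.crawler import CrawlerProcess


class QuotesSpider(scrapy.Spider):
    # placeholder spider; substitute the spider you actually want to run
    name = "quotes"
    start_urls = ["http://quotes.toscrape.com/"]

    def parse(self, response):
        for text in response.css("span.text::text").getall():
            yield {"text": text}


def catch_item(item, response, spider):
    print("Got:", item)


process = CrawlerProcess(settings={"LOG_ENABLED": False})
crawler = process.create_crawler(QuotesSpider)
# item_scraped replaces the old item_passed signal
crawler.signals.connect(catch_item, signal=signals.item_scraped)
process.crawl(crawler)
process.start()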
Code example #2
File: queen.py Project: Leon-Wulfgang/myCrawler
    def service_sis(self):
        process = CrawlerProcess({
            'USER_AGENT': 'Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1)'
        })

        process.crawl(worker.Worker)
        process.start()  # the script will block here until the crawling is finished
Code example #3
File: decc.py Project: CharlesNie/DECC
def main(argv):

	try:
		opts, args = getopt.getopt(argv, "cht:s:", ['title=', 'section='])
	except getopt.GetoptError:
		print 'Usage:\npython2.7 decc.py -h(help)\npython2.7 decc.py -c(crawl articles)\npython2.7 decc.py -s(search article by section) <section>\npython2.7 decc.py -t(search article by title) <title>'
		sys.exit(2)
	for opt, arg in opts:
		if opt == '-h':
			print 'Usage:\npython2.7 decc.py -h(help)\npython2.7 decc.py -c(crawl articles)\npython2.7 decc.py -s(search article by section) <section>\npython2.7 decc.py -t(search article by title) <title>'
			sys.exit()
		elif opt == '-c':
			# start crawling article here
			print "crawling"
			process = CrawlerProcess(get_project_settings())
			process.crawl(BBCArticleSpider)
			process.start()
		elif opt in  ('-t', '--title'):
			print "search by title"
			# start searching article by title
			results = BBCArticleItem.fetch_by_title(arg)
			for result in results:
				print result
		elif opt in ('-s', '--section'):
			print "search by section"
			# start searching article by section
			results = BBCArticleItem.fetch_by_section(arg)
			for result in results:
				print result
Code example #4
File: run.py Project: CkuT/crawlers
def crawl(spiders_classes, connector, debug=False, spider_error_callback=stdout_error_callback, scrapy_settings=None):
    """
    Launch a crawl job for the given JobSpider classes
    :param scrapy_settings: dict of settings merged with CrawlerProcess default settings
    :param debug: (bool) Enable or disable debug
    :param spider_error_callback: callback for spider errors (see http://doc.scrapy.org/en/latest/topics/signals.html#spider-error)
    :param connector: Connector instance
    :param spiders_classes: JobSpider class list
    :return: spider instance
    """
    if debug:
        dispatcher.connect(spider_error_callback, signals.spider_error)

    settings = {
        'ITEM_PIPELINES': {
            'pyjobs_crawlers.pipelines.RecordJobPipeline': 1,
        },
        'connector': connector,
        'LOG_ENABLED': False,
        'DOWNLOAD_DELAY': 1 if not debug else 0,
    }
    if scrapy_settings:
        settings.update(scrapy_settings)

    process = CrawlerProcess(settings)

    for spider_class in spiders_classes:
        process.crawl(spider_class, debug=debug)

    spiders = []
    for crawler in list(process.crawlers):
        spiders.append(crawler.spider)
    process.start()

    return spiders
Code example #5
File: crawlerBlog.py Project: Adastra-thw/pyHacks
def main():
	"""Rutina principal para la ejecución del Spider"""
	# set up signal to catch items scraped
	from scrapy import signals
	from scrapy.xlib.pydispatch import dispatcher

	def catch_item(sender, item, **kwargs):
		print "Item Extraido:", item
	dispatcher.connect(catch_item, signal=signals.item_passed)

	from scrapy.conf import settings
	settings.overrides['LOG_ENABLED'] = False

	# setup crawler
	from scrapy.crawler import CrawlerProcess

	crawler = CrawlerProcess(settings)
	crawler.install()
	crawler.configure()

	# define the spider for the crawler
	crawler.crawl(BloggerSpider())

	# start scrapy
	print "STARTING ENGINE"
	crawler.start()
	print "ENGINE STOPPED"
Code example #6
File: mult.py Project: odinplus/lnkdn_scrpr
class CrawlerScript():

    def __init__(self):
        settings = get_project_settings()
        settings.set('LOG_ENABLED', False, priority='cmdline')
        #settings.overrides['LOG_ENABLED'] = False
        self.crawler = CrawlerProcess(settings)
        self.items = []
        SignalManager(dispatcher.Any).connect(self._item_passed, signal=signals.item_scraped)

    def _item_passed(self,item,response,spider):
        self.items.append(item)

    def _crawl(self, q, queue):
        self.crawler.crawl(BingSpider, q=q)
        self.crawler.start()
        self.crawler.stop()
        queue.put(self.items)

    def crawl(self, q):
        queue = Queue()
        p = Process(target=self._crawl, args=[q, queue])
        p.start()
        p.join()
        return queue.get(True)
Code example #7
File: see_whats_going_on.py Project: georgi0u/mgrok
def get_scraped_sites_data():
    """Returns output for venues which need to be scraped."""
    class RefDict(dict):
        """A dictionary which returns a reference to itself when deepcopied."""
        def __deepcopy__(self, memo):
            return self

    # Hack: we pass a dictionary which can't be deep-copied into the settings
    # so as to _return_ the scraper output. As far as I can tell, this is the
    # only way to return the scraper output to the script itself.
    output = RefDict()

    settings = Settings({
        'LOG_ENABLED': False,
        'ITEM_PIPELINES': {
            'mgrok.pipelines.JsonWriterPipeline': 1
            },
        'PIPELINE_OUTPUT': output,
        'USER_AGENT': 'Chrome/41.0.2228.0'
        })

    crawler_process = CrawlerProcess(settings)
    for spider in SCRAPY_SPIDERS:
        crawler_process.crawl(spider)

    crawler_process.start()

    return output
Code example #8
def run_spider(spider, settings):
    """Run a spider with given settings"""
    from scrapy import signals
    from scrapy.xlib.pydispatch import dispatcher
    from scrapy.settings import CrawlerSettings
      
    def catch_item(sender, item, **kwargs):
        #log.msg("Got:" + str(item))
        pass
       
    dispatcher.connect(catch_item, signal=signals.item_passed)

    """clean storage"""
    scraperwiki.sqlite.execute("drop table if exists "+spider.name)
    scraperwiki.sqlite.commit()


    from scrapy.crawler import CrawlerProcess

    settings = CrawlerSettings(values=settings)

    crawler = CrawlerProcess(settings)
    crawler.install()
    crawler.configure()
    crawler.crawl(spider)

    #log.start(loglevel='DEBUG')

    crawler.start()
Code example #9
File: scrape.py Project: benbp/showfinder
def run(urls, city):
    process = CrawlerProcess()
    spiders = [make_spider(artist, url, city) for artist, url in urls]
    for spider_cls in spiders:
        process.crawl(spider_cls)
    # the script will block here until the crawling is finished
    process.start()
Code example #10
File: web_run.py Project: shanyue-video/video_scrapy
    def get(self):
        while True:
            process = CrawlerProcess(get_project_settings())
            process.crawl('iqiyi')
            # note: process.start() blocks, and the Twisted reactor cannot be
            # restarted, so a second loop iteration would raise an error
            process.start()
            time.sleep(3000)
        self.finish()
Code example #11
    def handle(self, *args, **options):
        # It would be better to pass this in as a parameter to PayoutSpider
        global start_date
        start_date = datetime.datetime(2015, 1, 1, tzinfo=pytz.UTC)

        delete = options.get('delete')
        delete_all = options.get('delete_all')
        retrieve_all = options.get('retrieve_all')

        previous_payout = None
        previous_payouts = codementor_models.Payout.objects.all().order_by('-date')
        if delete_all or (delete and previous_payouts.count() == 0):
            codementor_models.Review.objects.all().delete()
            codementor_models.Session.objects.all().delete()
            codementor_models.Payout.objects.all().delete()
            codementor_models.Payment.objects.all().delete()
        elif delete:
            previous_payout = previous_payouts[0]
            codementor_models.Review.objects.filter(date__gt=start_date).delete()
            codementor_models.Session.objects.filter(started_at__gt=start_date).delete()
            previous_payout.delete()
            codementor_models.Payment.objects.filter(payout__isnull=True).delete()

        if not retrieve_all and previous_payout:
            start_date = previous_payout.date

        process = CrawlerProcess({
            'USER_AGENT': 'Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1)'
        })

        process.crawl(PayoutSpider)
        process.start()
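Regarding the comment at the top of handle() above: CrawlerProcess.crawl() forwards extra keyword arguments to the spider, so start_date could be passed explicitly instead of through a module-level global. A minimal self-contained sketch of that mechanism; DemoSpider is a placeholder, not the project's PayoutSpider:

import datetime

import pytz
import scrapy
from scrapy.crawler import CrawlerProcess


class DemoSpider(scrapy.Spider):
    name = "demo"

    def __init__(self, start_date=None, *args, **kwargs):
        super().__init__(*args, **kwargs)
        # the keyword argument given to process.crawl() arrives here
        self.start_date = start_date

    def start_requests(self):
        self.logger.info("crawling payouts since %s", self.start_date)
        return []


process = CrawlerProcess()
process.crawl(DemoSpider, start_date=datetime.datetime(2015, 1, 1, tzinfo=pytz.UTC))
process.start()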
Code example #12
def spiderCrawl(bandname):
    createLink(bandname)
    settings = get_project_settings()
    settings.set('USER_AGENT', 'Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1)')
    process = CrawlerProcess(settings)
    process.crawl(MySpider)
    process.start()
Code example #13
    def run(self):
        settings = get_project_settings()
        process = CrawlerProcess(settings)

        process.crawl('stackoverflow')
        process.start()
Code example #14
def scrape(spider):
    with transaction.atomic(), reversion.create_revision():
        process = CrawlerProcess(DEFAULT_CRAWLER_OPTIONS)
        process.crawl(spider)
        # the script will block here until the crawling is finished
        process.start()
    return
Code example #15
File: crawler.py Project: nw4869/flask-scrapy
class MySpiderProcess1(scrapy.Spider):
    def __init__(self, name, urls):
        self.name = name
        self.start_urls = urls
        scrapy.Spider.__init__(self)

    def parse(self, response):
        print('parse response')

    def _crawl(self):
        settings = Settings()
        settings.set('ITEM_PIPELINES', {
            'app.pipelines.JsonWriterPipeline': 300
        })
        self.process = CrawlerProcess(settings)
        self.process.crawl(self, self.name, self.start_urls)
        self.process.start()
        # self.process.stop()
        # self.process.join()

    def start(self):
        p = Process(target=self._crawl)
        p.start()
        p.join()

    #
    # def start(self):
    #     self._crawl()

    def stop(self):
        self.process.stop()
Code example #16
def main():
    """Setups item signal and run the spider"""
    # set up signal to catch items scraped
    from scrapy import signals
    from scrapy.xlib.pydispatch import dispatcher
    
    
    def catch_item(sender, item, **kwargs):
        print "Got:", item

    # shut off log
    from scrapy.conf import settings
    settings.overrides['LOG_ENABLED'] = False

    # set up crawler
    from scrapy.crawler import CrawlerProcess

    crawler = CrawlerProcess(settings)
    crawler.install()
    crawler.configure()

    # schedule spider
    crawler.crawl(MySpider())

    # start engine scrapy/twisted
    print "STARTING ENGINE"
    crawler.start()
    print "ENGINE STOPPED"
Code example #17
File: scr.py Project: JINDALG/Roofpik_scrapy
def magic():

	process = CrawlerProcess(get_project_settings())

	# 'magic' is the name of one of the spiders of the project.
	process.crawl('magic')
	process.start() # the script will block here until the crawling is finished
Code example #18
File: main.py Project: dagrooms52/TabCrawler
def main(tabLink):
    # str.find() returns -1 (truthy) when not found, so test membership instead
    if "ultimate-guitar.com" in tabLink:
        tabSpider = Spiders.Ultimate(tabLink)
    elif "guitartabs.cc" in tabLink:
        tabSpider = Spiders.TabCC(tabLink)
    else:
        print("Domain name not supported.")
        return

    # Make a process to instantiate an Ultimate spider with the given
    # arguments and make it crawl the link
    process = CrawlerProcess(get_project_settings())
    process.crawl(tabSpider, link=tabLink)
    process.start()

    # Link has been scraped, now process it
    tree = xmltree.parse(tabs.pipelines.filename)
    root = tree.getroot()
    value = root[0][0][0]
    rawTab = value.text

    if("\M" in rawTab):
        rawTab = parsefuncs.removeLineEndings(rawTab)

    cleanTab = parsefuncs.parseTab(rawTab)

    print("Clean tab is:")
    count = 0
    for line in cleanTab:
        count += 1
        print(line)
        if(count % 6 == 0):
            print(" ")
Code example #19
def main():
    """Index alexa demographics
    """

    engine = db_connect()
    Session = sessionmaker(bind=engine)
    session = Session()

    settings = get_project_settings()
    settings.set('ITEM_PIPELINES',
                 {'demographic_scraper.demographic_scraper.pipelines.WebsiteDemographicPipeline': 300})
    settings.set('EXTENSIONS',
                 {'scrapy.telnet.TelnetConsole': None,})


    process = CrawlerProcess(settings)
    for website in session.query(WebsitesContent).all():
        demographic = list(session.query(Websites).filter_by(link=website.link))
        if len(demographic) == 0:
            url = website.link
            print website.link
            AlexaSpider.name = url
            process.crawl(AlexaSpider, url=url, db_session=session)
    process.start()
    process.stop()

    session.close()
Code example #20
File: tasks.py Project: sdlirjc/algorithm
def _crawl(path=None):
    crawl = CrawlerProcess({
        'USER_AGENT': 'Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1)'
    })
    crawl.crawl(ProvinceSpider)
    crawl.start()
    crawl.stop()
Code example #21
File: spider_start.py Project: NLPScott/bdbk-kb
    def handle(self, *args, **options):
        setting = {
            'USER_AGENT': options['user_agent'],
            'DOWNLOAD_DELAY': options['download_delay'],
            'LOG_FILE': settings.SCRAPY_LOG_FILE,
            'LOG_LEVEL': settings.SCRAPY_LOG_LEVEL,
        }

        if options['proxy_list']:
            try:
                f = open(options['proxy_list'])
            except IOError as e:
                raise CommandError('cannot open proxy list file for read')

            # Retry many times since proxies often fail
            setting['RETRY_TIMES'] = 10
            # Retry on most error codes since proxies fail for different reasons
            setting['RETRY_HTTP_CODES'] = [500, 503, 504, 400, 403, 404, 408]
            setting['DOWNLOADER_MIDDLEWARES'] = {
                'scrapy.downloadermiddlewares.retry.RetryMiddleware': 90,
                'spider.randomproxy.RandomProxy': 100,
                'scrapy.downloadermiddlewares.httpproxy.HttpProxyMiddleware': 110,
            }
            setting['PROXY_LIST'] = options['proxy_list']

        process = CrawlerProcess(setting)

        process.crawl(BaiduSpider)
        process.start()
Code example #22
File: cli.py Project: Lukas0907/feeds
def crawl(ctx, spiders, stats):
    """
    Crawl one, many, or all pages.

    What spider(s) to run is determined in the following order:

      1. Spider(s) given as argument(s)

      2. Spider(s) specified in the configuration file

    Note that if a spider is given as an argument, the spiders in the
    configuration file are ignored. All available spiders will be used to
    crawl if no arguments are given and no spiders are configured.
    """
    settings = ctx.obj["settings"]
    if stats:
        settings.set("STATS_CLASS", "scrapy.statscollectors.MemoryStatsCollector")

    # Start a new crawler process.
    process = CrawlerProcess(settings)
    spiders = spiders_to_crawl(process, spiders)
    if not spiders:
        logger.error("Please specify what spiders you want to run!")
    else:
        for spider in spiders:
            logger.info("Starting crawl of {} ...".format(spider))
            process.crawl(spider)

    process.start()

    if settings.getbool("HTTPCACHE_ENABLED"):
        run_cleanup_cache(settings)
Code example #23
File: casaLibro.py Project: flubbers/AZScraping
def scrapeando():
    from scrapy import signals
    from scrapy.xlib.pydispatch import dispatcher

    def catch_item(sender, item, **kwargs):
        """Rellenamos la BD"""
        for i in enumerate(item.items()):
            x = i[0]
            query = "INSERT INTO book (Nombre ,Autor, Editorial ,Fecha, Precio, Link) VALUES ("+decodifica(item['Nombre'][x])+","+decodifica(item['Autor'][x])+","+decodifica(item['Editorial'][x])+","+decodifica(item['Fecha'][x])+","+decodifica(item['Precio'][x])+","+decodifica("http://www.casadellibro.com"+item['Link'][x])+");"
            db.micursor.execute(query)
            db.conexion.commit()
        print item

    dispatcher.connect(catch_item, signal=signals.item_passed)

    from scrapy.conf import settings
    settings.overrides['LOG_ENABLED'] = False

    from scrapy.crawler import CrawlerProcess

    crawler = CrawlerProcess(settings)
    crawler.install()
    crawler.configure()
    book = BookSpider()
    book.busqueda=unicode(search.getbusqueda())
    crawler.crawl(book)
    print "Start scraping to la Casa del Libro"
    crawler.start()
    print "End scraping to la Casa del Libro"
    crawler.stop()
Code example #24
File: uiCompare.py Project: AugustLONG/mcubed
def ScrapeSite():
    db = 'crunchbase_startups'
    sitedomain = raw_input("Enter site domain: ") # get user input
    sitedomain = parse_base_url(sitedomain) # clean url
    
    sql = 'SELECT text FROM {} WHERE siteurl = %s'.format(db)
    
    cur.execute(sql, sitedomain)
    sitetext = cur.fetch()
    
    if sitetext != '': # what does an empty ping return?
        print 'Site already scraped.'
        return sitetext
    
    process = CrawlerProcess({
        'USER_AGENT': 'Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1)',
        'ITEM_PIPELINES': {'pipelines.UserInputPipeline': 100},
        'DEPTH_LIMIT': 2,
        'DOWNLOAD_HANDLERS': {'s3': None,}
        ,'LOG_LEVEL': 'INFO'
    })
    
    process.crawl(SoloSpider, domain = sitedomain)
    process.start()
    
    # presumably finished here - pull newly loaded sitetext for domain
    
    cur.execute(sql, sitedomain)
    return cur.fetch()
Code example #25
File: ScapyTest.py Project: ohansrud/StockUtils
    def Test_Scapy(self):
        spider = FtpSpider()

        process = CrawlerProcess({"USER_AGENT": "Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1)"})

        process.crawl(spider)
        process.start()
Code example #26
File: task.py Project: ohhdemgirls/od-database
    def execute_task(self, website: Website, busy: Value, post_id: str, comment_id: str):
        busy.value = 1
        if os.path.exists("data.json"):
            os.remove("data.json")
        print("Started crawling task")
        process = CrawlerProcess(get_project_settings())
        process.crawl("od_links", base_url=website.url)
        process.start()
        print("Done crawling")

        self.db.import_json("data.json", website)
        os.remove("data.json")
        print("Imported in SQLite3")

        if post_id:
            # Reply to post
            stats = self.db.get_website_stats(website.id)
            comment = self.reddit_bot.get_comment({"": stats}, website.id)
            print(comment)
            if "total_size" in stats and stats["total_size"] > 10000000:
                post = self.reddit_bot.reddit.submission(post_id)
                self.reddit_bot.reply(post, comment)
                pass
            else:
                self.reddit_bot.log_crawl(post_id)

        elif comment_id:
            # Reply to comment
            stats = self.db.get_website_stats(website.id)
            comment = self.reddit_bot.get_comment({"There you go!": stats}, website.id)
            print(comment)
            reddit_comment = self.reddit_bot.reddit.comment(comment_id)
            self.reddit_bot.reply(reddit_comment, comment)
        busy.value = 0
        print("Done crawling task")
Code example #27
File: tasks.py Project: falquaddoomi/gmat_collector
def runSpiderProcess(spider_cls, *args, **kwargs):
    """
    Helper method that starts a spider with the given init arguments, waits for it to complete, and returns the
    items it yielded in a list.
    :param spider_cls: the spider class to run
    :param args: the indexed arguments to the spider
    :param kwargs: the keyword arguments to the spider
    :return: a list of items yielded by the spider
    """
    process = CrawlerProcess()
    process.crawl(spider_cls, *args, **kwargs)

    final_result = []

    def _nab_item(item):
        # FIXME: this silly dance of encoding and decoding is to prevent scrapy items from being returned to celery
        # FIXME: celery can't serialize them, so it throws a rather opaque error, but it's fine with lists and dicts
        final_result.append(json.loads(scrapy_encoder.encode(item)))

    for crawler in process.crawlers:
        crawler.signals.connect(_nab_item, item_scraped)

    process.start()
    process.stop()

    return final_result
Code example #28
File: exporterfromlist.py Project: arapidhs/scry
	def __init__(self, titlesfile = None, platform = None, region = None):

		# set default encoding to utf8 for parsing and logging
		# utf-8 characters in console and files
		#
		reload(sys)
		sys.setdefaultencoding('utf8')
        
		configure_logging(install_root_handler=False)
		logging.basicConfig(
			filename='export.log',
			filemode = 'a',
			format='%(levelname)s: %(message)s',
			level=logging.INFO
		)
                				
		# identify platform
		#
		self.platform = platform
		if self.platform is None:
			logging.error('No platform found! Pass it as an argument.')
			return
		else:			
			platformId = platforms.getId(self.platform)
			if platformId is None:
				logging.error('Platform ' + self.platform + ' not supported.')
				return
						
		self.titlesfile = titlesfile
		self.region = region		
		if self.region is None:
			self.region = "Worldwide"
		
		if titlesfile:		
		
			titles = []
			urls = []
			
			with open( self.titlesfile ) as f:
				titles = f.read().splitlines()
				
			for title in titles:
				logging.debug('Submitting title:' + title )
				urls.append(
					'http://mobygames.com/search/quick' +
					'?q=' + title +
					'&p=' + platformId +
					'&search=Go'
					'&sFilter=1'
					'&sG=on'
					'&search_title=' + urllib.quote( title ) + 
					'&search_platform=' + urllib.quote(self.platform) +
					'&search_region=' + urllib.quote(self.region)
				)
				
			process = CrawlerProcess(get_project_settings())
			process.crawl(MobygamesSpider, start_urls=urls)
			process.start()									
		else:
			logging.warning('No file.')
Code example #29
File: news_flash_crawl.py Project: hasadna/anyway
def news_flash_crawl(rss_link, site_name, maps_key):
    id_flash = get_latest_id_from_db() + 1
    latest_date = get_latest_date_from_db()
    d = feedparser.parse(rss_link)
    process = CrawlerProcess()
    for entry in d.entries[::-1]:
        entry_parsed_date = datetime.strptime(entry.published[:-6], '%a, %d %b %Y %H:%M:%S')
        entry_parsed_date = entry_parsed_date.replace(tzinfo=None)
        if (latest_date is not None and entry_parsed_date > latest_date) or latest_date is None:
            news_item = {'id_flash': id_flash, 'date_parsed': entry_parsed_date, 'title': entry.title,
                         'link': entry.links[0].href, 'date': entry.published, 'location': '', 'lat': 0, 'lon': 0}
            if (u'תאונ' in entry.title and u'תאונת עבודה' not in entry.title and u'תאונות עבודה' not in entry.title)\
                    or ((u'רכב' in entry.title or u'אוטובוס' in entry.title or u"ג'יפ" in entry.title
                         or u'משאית' in entry.title or u'קטנוע'
                         in entry.title or u'אופנוע' in entry.title or u'אופניים' in entry.title or u'קורקינט'
                         in entry.title or u'הולך רגל' in entry.title or u'הולכת רגל' in entry.title
                         or u'הולכי רגל' in entry.title) and
                        (u'נפגע' in entry.title or u'פגיע' in entry.title or
                         u'נפצע' in entry.title or u'פציע' in entry.title or u'התנגש' in entry.title or u'התהפך'
                         in entry.title or u'התהפכ' in entry.title)):
                news_item['accident'] = True
            else:
                news_item['accident'] = False
            if site_name == 'ynet':
                news_item['source'] = 'ynet'
                process.crawl(YnetFlashScrap, entry.links[0].href, news_item=news_item, maps_key=maps_key)
            id_flash = id_flash + 1
    process.start()
Code example #30
File: run.py Project: algoo/crawlers
def crawl(spiders_classes, connector, debug=False, spider_error_callback=stdout_error_callback):
    """
    Launch a crawl job for the given JobSpider classes
    :param debug: (bool) Enable or disable debug
    :param spider_error_callback: callback for spider errors (see http://doc.scrapy.org/en/latest/topics/signals.html#spider-error)
    :param connector: Connector instance
    :param spiders_classes: JobSpider class list
    :return: spider instance
    """
    if debug:
        dispatcher.connect(spider_error_callback, signals.spider_error)

    process = CrawlerProcess({
        'ITEM_PIPELINES': {
            'pyjobs_crawlers.pipelines.RecordJobPipeline': 1,
        },
        'connector': connector,
        'LOG_ENABLED': False
    })

    for spider_class in spiders_classes:
        process.crawl(spider_class)

    spiders = []
    for crawler in list(process.crawlers):
        spiders.append(crawler.spider)
    process.start()

    return spiders
Code example #31
from wikipedia.spiders import WikipediaSpider
from scrapy.crawler import CrawlerProcess
import networkx as nx
import matplotlib.pyplot as plt
import urllib.parse

if __name__ == "__main__":
    crawl_depth = 2
    process = CrawlerProcess({
        'LOG_LEVEL': 'ERROR',
        'DEPTH_LIMIT': crawl_depth
    })
    process.crawl(WikipediaSpider)
    spider = next(iter(process.crawlers)).spider
    spider.max_items_per_page = 5
    spider.max_crawl_depth = crawl_depth
    process.start()

    for pm in spider.linked_pages:
        print(pm.depth, pm.link, pm.child_link)
    print("-" * 80)

    g = nx.Graph()

    nodes = {}
    edges = {}

    for pm in spider.linked_pages:
        if pm.title not in nodes:
            nodes[pm.title] = pm
            g.add_node(pm.title)
Code example #32
from scrapy.crawler import CrawlerProcess
from news.spiders.adevaru_spider import AdevarulSpider
from news.spiders.hotnews_spider import HotnewsSpider
from news.spiders.agerpress_spider import AgerpressSpider
from news.spiders.digi_spider import DigiSpider
from news.spiders.tvr_spider import TVRSpider
from news.spiders.protv_spider import ProTVSpider
from news.spiders.realitatea_spider import RealitateaSpider
from scrapy.utils.project import get_project_settings

process = CrawlerProcess(get_project_settings())

process.crawl(AdevarulSpider)
process.crawl(RealitateaSpider)
process.crawl(ProTVSpider)
process.crawl(HotnewsSpider)
process.crawl(AgerpressSpider)
process.crawl(DigiSpider)
process.crawl(TVRSpider)

process.start()
Code example #33
import asyncio

from twisted.internet import asyncioreactor
asyncioreactor.install(asyncio.get_event_loop())

import scrapy
from scrapy.crawler import CrawlerProcess


class NoRequestsSpider(scrapy.Spider):
    name = 'no_request'

    def start_requests(self):
        return []


process = CrawlerProcess(settings={
    "TWISTED_REACTOR":
    "twisted.internet.asyncioreactor.AsyncioSelectorReactor",
})
process.crawl(NoRequestsSpider)
process.start()
Code example #34
        'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_14_5) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/78.0.3904.108 Safari/537.36',
        'ITEM_PIPELINES': {
            'pipelines.SaveUserReviewPipeline': 300
        },
        'DOWNLOADER_MIDDLEWARES': {
            'scrapy_splash.SplashCookiesMiddleware':
            723,
            'scrapy_splash.SplashMiddleware':
            725,
            'scrapy.downloadermiddlewares.httpcompression.HttpCompressionMiddleware':
            810,
        },
        'POSTGRES_HOST': 'localhost',
        'POSTGRES_PORT': '25432',
        'POSTGRES_DB': 'mob',
        'POSTGRES_USER': '******',
        'POSTGRES_PASSWORD': '******'
    })

if len(sys.argv) == 1:
    scheduler = TwistedScheduler()
    scheduler.add_job(process.crawl,
                      'interval',
                      args=[UserReviewSpider, lambda: start_objs()],
                      seconds=30)
    scheduler.start()
    process.start(False)
else:
    process.crawl(UserReviewSpider, lambda: start_objs())
    process.start()
Code example #35
File: program.py Project: swishcloud/spider-51job
from tutorial.spiders.SpiderPost import SpiderPost
from scrapy.crawler import CrawlerProcess
from scrapy import *

process = CrawlerProcess({'USER_AGENT': 'bigyasuo/qq-1801041646'})
csharp = "https://search.51job.com/list/040000,000000,0000,00,9,99,c%2523,2,1.html?lang=c&stype=&postchannel=0000&workyear=99&cotype=99&degreefrom=99&jobterm=99&companysize=99&providesalary=99&lonlat=0%2C0&radius=-1&ord_field=0&confirmdate=9&fromType=&dibiaoid=0&address=&line=&specialarea=00&from=&welfare="
dotnet = "https://search.51job.com/list/040000,000000,0000,00,9,99,.net,2,1.html?lang=c&stype=&postchannel=0000&workyear=99&cotype=99&degreefrom=99&jobterm=99&companysize=99&providesalary=99&lonlat=0%2C0&radius=-1&ord_field=0&confirmdate=9&fromType=&dibiaoid=0&address=&line=&specialarea=00&from=&welfare="
shanghai_csharp = "https://search.51job.com/list/020000,000000,0000,00,9,99,c%2523,2,1.html?lang=c&stype=&postchannel=0000&workyear=99&cotype=99&degreefrom=99&jobterm=99&companysize=99&providesalary=99&lonlat=0%2C0&radius=-1&ord_field=0&confirmdate=9&fromType=&dibiaoid=0&address=&line=&specialarea=00&from=&welfare="
shanghai_dotnet = "https://search.51job.com/list/020000,000000,0000,00,9,99,.net,2,1.html?lang=c&stype=&postchannel=0000&workyear=99&cotype=99&degreefrom=99&jobterm=99&companysize=99&providesalary=99&lonlat=0%2C0&radius=-1&ord_field=0&confirmdate=9&fromType=&dibiaoid=0&address=&line=&specialarea=00&from=&welfare="
# urls=[csharp,dotnet]
urls = [shanghai_csharp, shanghai_dotnet]
keys = ['c#', '.net']
process.crawl(SpiderPost, urls, 10000, keys)
process.start()  # the script will block here until the crawling is finished
Code example #36
def main():
    # Parse command line arguments
    parser = argparse.ArgumentParser(description=__name__)
    # Specify what type of crawl we want to do
    crawl_type = parser.add_mutually_exclusive_group(required=True)
    crawl_type.add_argument("--site", "-s", help="Name of site to crawl.")
    crawl_type.add_argument("--all",
                            "-a",
                            action="store_true",
                            help="Crawl all sites.")
    crawl_type.add_argument(
        "--list",
        "-l",
        help=
        "CSV file of URLs to crawl with an 'article_url' column and a 'site name' column."
    )
    # General options
    parser.add_argument(
        "--max_articles",
        "-n",
        type=int,
        default=0,
        help="Maximum number of articles to process from each site.")
    parser.add_argument("--exporter",
                        "-e",
                        default="file",
                        choices=["file", "blob"],
                        help="Article export method.")
    parser.add_argument("--no-digest",
                        action="store_true",
                        help="Disable content digests.")
    parser.add_argument("--no-index",
                        action="store_true",
                        help="Disable node indexes.")
    args = parser.parse_args()

    # Set up logging
    configure_logging()
    logging.getLogger("azure.storage.common.storageclient").setLevel(
        logging.ERROR)
    logging.getLogger("sqlalchemy").setLevel(logging.ERROR)

    # Load crawler settings and apply local overrides
    settings = get_project_settings()
    settings.update({
        'ARTICLE_EXPORTER': args.exporter,
        'CONTENT_DIGESTS': (not args.no_digest),
        'NODE_INDEXES': (not args.no_index),
    })
    # Apply an item limit if specified
    if args.max_articles:
        settings.update({'CLOSESPIDER_ITEMCOUNT': args.max_articles})

    # Set up a crawler process
    process = CrawlerProcess(settings)

    # Load crawler configurations for all sites
    site_configs = yaml.load(pkg_resources.resource_string(
        __name__, "site_configs.yml"),
                             Loader=yaml.FullLoader)
    article_override_lists = yaml.load(pkg_resources.resource_string(
        __name__, "article_override_lists.yml"),
                                       Loader=yaml.FullLoader)
    for site_name in article_override_lists:
        site_configs[site_name][
            "article_override_list"] = article_override_lists[site_name]

    # Crawl a single site
    # -------------------
    if args.site:
        # Create a dynamic spider class and register it with the crawler
        spider_class = dynamic_spider_class(site_configs[args.site],
                                            args.max_articles)
        process.crawl(spider_class, config=site_configs[args.site])

    # Crawl all sites
    # ---------------
    elif args.all:
        for site_name in site_configs:
            # Create a dynamic spider class and register it with the crawler
            spider_class = dynamic_spider_class(site_configs[site_name],
                                                args.max_articles)
            process.crawl(spider_class, config=site_configs[site_name])

    # Crawl all URLs from a CSV file
    # ------------------------------
    elif args.list:
        # Load articles from CSV into a dictionary
        article_urls = defaultdict(list)
        with open(args.list, "r") as csvfile:
            dialect = csv.Sniffer().sniff(csvfile.read(50))
            csvfile.seek(0)
            reader = csv.DictReader(csvfile, dialect=dialect)
            if not all(
                [f in reader.fieldnames
                 for f in ["article_url", "site_name"]]):
                raise ValueError(
                    "CSV input must have an 'article_url' column and a 'site name' column"
                )
            for row in reader:
                article_urls[row["site_name"]].append(row["article_url"])
        # Iterate over each site
        for site_name in sorted(article_urls.keys()):
            # Override the configuration for the specified site
            site_config = site_configs[site_name]
            site_config["start_url"] = ""
            site_config["article_override_list"] = article_urls[site_name]
            # Create a dynamic spider class and register it with the crawler
            spider_class = dynamic_spider_class(site_config, args.max_articles)
            process.crawl(spider_class, config=site_config)

    # Start the crawler
    process.start()
Code example #37
            columns['question'].append(question_text)
            columns['answer'].append(answer_text)
            columns['answer_html'].append(answer_html)

        today = date.today()

        columns["link"] = [
            "https://www.who.int/news-room/q-a-detail/q-a-coronaviruses"
        ] * len(columns["question"])
        columns["name"] = ["Q&A on coronaviruses (COVID-19)"] * len(
            columns["question"])
        columns["source"] = ["Robert Koch Institute (RKI)"] * len(
            columns["question"])
        columns["category"] = [""] * len(columns["question"])
        columns["country"] = ["DE"] * len(columns["question"])
        columns["region"] = [""] * len(columns["question"])
        columns["city"] = [""] * len(columns["question"])
        columns["lang"] = ["de"] * len(columns["question"])
        columns["last_update"] = [today.strftime("%Y/%m/%d")] * len(
            columns["question"])

        return columns


if __name__ == "__main__":
    process = CrawlerProcess(
        {'USER_AGENT': 'Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1)'})

    process.crawl(CovidScraper)
    process.start()
Code example #38
    def handle(self, *args, **options):
        process = CrawlerProcess(get_project_settings())
        # process.crawl(CsSpider)
        process.crawl(ComeduSpider)
        process.start()
Code example #39
    def handle(self, *args, **options):
        if options.get("drop_all"):
            self.warn("Apagando registros...")
            CityCouncilAgenda.objects.all().delete()
            CityCouncilAttendanceList.objects.all().delete()
            CityCouncilMinute.objects.all().delete()
            CityHallBid.objects.all().delete()
            Gazette.objects.all().delete()
            GazetteEvent.objects.all().delete()
            File.objects.all().delete()

        dispatcher.connect(self.save, signal=signals.item_passed)
        os.environ["SCRAPY_SETTINGS_MODULE"] = "scraper.settings"
        settings = get_project_settings()

        if options.get("scrapy_args"):
            scrapy_args = json.loads(options.get("scrapy_args"))
            settings.update(scrapy_args)

        process = CrawlerProcess(settings=settings)
        process.crawl(
            AgendaSpider,
            start_from_date=CityCouncilAgenda.last_collected_item_date(),
        )
        process.crawl(
            AttendanceListSpider,
            start_from_date=CityCouncilAttendanceList.last_collected_item_date(),
        )
        process.crawl(
            MinuteSpider, start_from_date=CityCouncilMinute.last_collected_item_date()
        )
        process.crawl(
            BidsSpider, start_from_date=CityHallBid.last_collected_item_date()
        )

        last_collected_gazette = Gazette.last_collected_item_date()
        if last_collected_gazette is None:
            process.crawl(LegacyGazetteSpider)
        process.crawl(
            ExecutiveAndLegislativeGazetteSpider,
            start_from_date=last_collected_gazette,
        )

        self.warn("Iniciando a coleta...")
        process.start()
        self.success("Pronto!")
Code example #40
            else:
                request = 'http://maps.google.com/maps/api/geocode/json?address={},+{},+{}'.format(street, district,
                                                                                                   city)

            if idx > 0:
                sleep(random.randint(2, 10))  # prevent getting blocked from the google API

            r = requests.get(request)
            results = r.json()["results"]
            result_types = list(map(lambda x: ",".join(x["types"]), results))
            exact_matches_idx = [index for index, value in enumerate(result_types) if
                                 "street_address" in value or "establishment" in value or "premise" in value]

            if len(exact_matches_idx) != 1:
                coords.append("ambiguous address")
            else:
                idx = exact_matches_idx[0]
                location = results[idx]["geometry"]["location"]
                coords.append([location["lat"], location["lng"]])
        return coords


os.environ["SCRAPY_SETTINGS_MODULE"] = "is24crawler.settings"
print("------SETTINGS------ csv path: {}".format(settings.CSV_FILE_PATH))
print("------SETTINGS------ start page: {}".format(settings.PAGE_START))
print("------SETTINGS------ end page: {}".format(settings.PAGE_END))

process = CrawlerProcess(get_project_settings())
process.crawl(Immoscout24Bot)
process.start()
Code example #41
File: runspider.py Project: lemonxug/CarPriceSpider
import scrapy
from scrapy.crawler import CrawlerProcess
from CarPriceSpider.CarPriceSpider.spiders.xcar_area import XcarAreaSpider

process = CrawlerProcess()
process.crawl(XcarAreaSpider)
# process.crawl(MySpider2)
process.start(
)  # the script will block here until all crawling jobs are finished
Code example #42
        parser = reqparse.RequestParser()
        parser.add_argument('query',
                            required=True,
                            help='A search term needs to be provided')
        parser.add_argument('brand',
                            required=True,
                            help='A search term needs to be provided')

        args = parser.parse_args()

        product = parse.urlencode({'query': args.query})
        brand = (parse.urlencode({'brand': args.brand})).split("=")[1]
        find = product + '+' + brand
        print(find)


s = get_project_settings()
process = CrawlerProcess(s)
process.crawl('ebay', find)

process.start()

print('Crawling Completed')

api.add_resource(SteamSearch, '/query')

if __name__ == '__main__':
    #app.run(host='0.0.0.0',port=5000,debug=True)
    app_server = WSGIServer(('0.0.0.0', 5000), app)
    app_server.serve_forever()
Code example #43
from scrapy.crawler import CrawlerProcess
from scrapy.settings import Settings
from media_parse import settings
from media_parse.spiders.VK_parser import VkParserSpider
group_name = ''
method = 'groups.getMembers'
method_2 = 'users.getSubscriptions'
access_token = ''

if __name__ == '__main__':
    crawl_settings = Settings()
    crawl_settings.setmodule(settings)
    crawl_procc = CrawlerProcess(settings=crawl_settings)
    crawl_procc.crawl(VkParserSpider, group_name, method, method_2,
                      access_token)
    crawl_procc.start()
Code example #44
File: main.py Project: beyondinfinite/crawler
from scrapy.utils.project import get_project_settings
from scrapy.crawler import CrawlerProcess

process = CrawlerProcess(get_project_settings())
process.crawl('xunzi')
process.start()
Code example #45
File: run.py Project: marcosValle/defmon
*  ____        __ __  __              *
* |  _ \  ___ / _|  \/  | ___  _ __   *
* | | | |/ _ \ |_| |\/| |/ _ \| '_ \  *
* | |_| |  __/  _| |  | | (_) | | | | *
* |____/ \___|_| |_|  |_|\___/|_| |_| *
*   			              *
***************************************
* DefMon Release 0.1                  *
* Coded by @__mvalle__		      *
***************************************
    """)

    parser = argparse.ArgumentParser(
        description=
        'Deface Monitor: recursively crawl a domain and check for defaced pages',
        epilog=
        'Example of use: ./run.py -d mydefaceddomain.com -u http://mydefaceddomain.com/hackedPages/'
    )
    parser.add_argument("--domain", '-d', help="Allowed domain", required=True)
    parser.add_argument("--url", '-u', help="Start URL", required=True)

    try:
        args = parser.parse_args()
    except:
        parser.print_help()
        exit(1)

    process = CrawlerProcess(get_project_settings())
    process.crawl('mySpider', domain=args.domain, start_url=args.url)
    process.start()
Code example #46
    def start_requests(self):
        for url in self.urls:
            yield scrapy.Request(url=url, callback=self.parse)

    def parse(self, response):
        project = {}

        project["id"] = response.xpath('/html/body/div[2]/div[1]/div/div/div/div[1]/@data-id').get()
        project["user_id"] = response.xpath('/html/body/div[2]/div[1]/div/div/div/div[1]/div[1]/div/div[2]/div[1]/div[1]/a/img/@data-id').get()
        project["title"] = response.xpath('/html/body/div[2]/div[1]/div/div/div/div[1]/div[1]/div/div[2]/div[1]/figcaption/span/text()').get()

        creative_fields = []
        for s_creative_field in response.css('li.ProjectTools-projectField-2yD'):
            creative_fields.append(s_creative_field.css('a::text').get().lower())

        project["creative_fields"] = creative_fields

        tags = []
        for s_tag in response.css('a.ProjectTags-tagLink-Hh_'):
            tags.append(s_tag.css('a::text').get().lower().strip())

        project["tags"] = tags

        self.projects.append(project)


process = CrawlerProcess()

process.crawl(ProjectsSpider)
process.start()
Code example #47
def run():
    process = CrawlerProcess()
    s = "JOBDIR={}".format(SAVE_PATH + "\\request")
    process.crawl(RrUserSpider)
    process.start()
Code example #48
File: data-2-scrape.py Project: serzh/prj-nlp
        for post in posts:
            yield {
                'text':
                re.sub(
                    "[\n\r\t]{1,}", "\n",
                    "".join(post.css('.messageText::text').extract()).strip(
                        "\n\t\r")),
                'author':
                post.css('.messageMeta .username.author::text').extract()[0],
                'date':
                post.css('.messageMeta .DateTime::text').extract()[0],
                'link':
                "http://forum.lvivport.com/" + post.css(
                    '.messageMeta .datePermalink::attr(href)').extract()[0]
            }
        next_page = response.css('.PageNav a.PageNavNext:not(.hidden)')[0]
        if next_page:
            yield response.follow(next_page, self.parse_inner_page)


process = CrawlerProcess({
    'USER_AGENT': 'Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1)',
    'DOWNLOAD_DELAY': '0.25',
    'FEED_FORMAT': 'jl',
    'FEED_URI': 'output.jsonline',
    'FEED_EXPORT_ENCODING': 'utf-8'
})

process.crawl(LvivPortScraper)
process.start()
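On Scrapy 2.1 and later the FEED_FORMAT / FEED_URI pair used above is deprecated in favour of the single FEEDS setting; an equivalent configuration would look roughly like this (a sketch keeping the same user agent, delay and output file):

from scrapy.crawler import CrawlerProcess

process = CrawlerProcess({
    'USER_AGENT': 'Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1)',
    'DOWNLOAD_DELAY': 0.25,
    # FEEDS maps each output URI to its export options
    'FEEDS': {
        'output.jsonline': {'format': 'jsonlines', 'encoding': 'utf-8'},
    },
})
# process.crawl(LvivPortScraper) and process.start() would then follow as above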
Code example #49
            for tag in body.select('style'):
                tag.decompose()

            text = body.get_text(separator='\n')
            text = text.replace("\n", " ").replace("\t",
                                                   " ").replace("\r", " ")
            return text.lower()

        web_text = get_text_bs(web_text)

        exsit_list = checkActivity(act_list, web_text)
        activities = ', '.join(exsit_list)
        start_url = ', '.join(self.start_urls)

        item = {}
        item['start_url'] = start_url
        item['activities'] = activities
        return item


process = CrawlerProcess(settings={
    "FEEDS": {
        "data/items_23.json": {
            "format": "json"
        },
    },
})

process.crawl(ActivitySpider23)
process.start()
Code example #50
def main():
    target_board = ['NSwitch']
    process = CrawlerProcess(get_project_settings())
    for board in target_board:
        process.crawl('PTTCrawler', board=board)
    # start once after scheduling every board; the reactor cannot be restarted
    process.start()
Code example #51
__author__ = 'LeoDong'


from scrapy.utils.log import configure_logging
from scrapy.crawler import CrawlerProcess
from scrapy.utils.project import get_project_settings
from SAECrawlers.spiders.PagesCrawler import PagesCrawler
from util import tool

# tool.init_database()
# tool.init_working_path()

# configure_logging()
process = CrawlerProcess(get_project_settings())
process.crawl(PagesCrawler)
process.start()
Code example #52
        Handles any exception that occurs while crawling and reissues a request to the server
        for the URL which failed.
        :param failure: Error details
        """
        # Logs all failures
        self.logger.error(repr(failure))

        # Checking the type of failure and handling it accordingly
        if failure.check(HttpError):
            self.logger.error('HttpError on %s', failure)

        elif failure.check(DNSLookupError):
            # This is the original request
            self.logger.error('DNSLookupError on %s', failure)

        elif failure.check(TimeoutError, TCPTimedOutError):
            self.logger.error('TimeoutError on %s', failure)

        # Reissue the failed request (failure.request holds the original Request)
        yield failure.request.replace(dont_filter=True,
                                      callback=self.download_data_files)


# Main program
process = CrawlerProcess(get_project_settings())
process.crawl(PathCrawler)
process.start()
os.remove(PathCrawler.pagination_file.name)
os.remove(PathCrawler.file_object.name)
Code example #53
File: run.py Project: shifei123/test
    args = parser.parse_args()

    settings = get_project_settings()
    if args.db_uri:
        settings.set('SQLALCHEMY_DATABASE_URI', args.db_uri)
    if args.user_agents:
        settings.set('USER_AGENT_FILE', args.user_agents)
    if args.log_file:
        settings.set('LOG_FILE', args.log_file)
    if args.log_level:
        settings.set('LOG_LEVEL', args.log_level)

    process = CrawlerProcess(settings)
    if args.crawler:
        for each_crawler in args.crawler:
            process.crawl(each_crawler)
    elif args.daily:
        process.crawl('douyu_daily')
        process.crawl('panda_daily')
        process.crawl('quanmin_daily')
        process.crawl('bilibili_daily')
    else:
        settings.set('CLOSESPIDER_TIMEOUT', 1000)
        process.crawl('bilibili')
        process.crawl('douyu')
        process.crawl('longzhu')
        process.crawl('panda')
        process.crawl('zhanqi')
        process.crawl('huya')
        process.crawl('quanmin')
        process.crawl('huomao')
Code example #54
def start_scraping(start_url, scrap_mode):
    process = CrawlerProcess({
        'USER_AGENT': 'Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1)'
    })
    process.crawl(forum_spider.CategoriesSpider, start_url, scrap_mode)
    process.start()
Code example #55
    custom_settings = {
        'DOWNLOAD_DELAY':
        '10',
        'USER_AGENT':
        'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36',
    }

    def start_requests(self):
        urls = getUrls()
        for url in urls:
            file = url.split("###V###")[1].strip()
            url = url.split("###V###")[0].strip()
            yield scrapy.Request(
                url=url,
                callback=lambda r, file=file: self.parse(r, file),
                dont_filter=True)

    def parse(self, response, file):
        directory = './data/kickstarter/creator/'
        filename = '%s.html' % file
        with open(os.path.join(directory, filename), 'wb') as f:
            f.write(response.url.strip())
            f.write(response.body)


print "Starting Crawl"
## Start crawling process and Spider
process = CrawlerProcess()
process.crawl(AmazonSpider)
process.start()
process.stop()
Code example #56
       # yield hotel_reponse

    @staticmethod
    def get_hotelidlist():
        try:
            db_connection = MySQLdb.connect('localhost', 'root', 'welcome', 'hotel_livedb')
            cursor = db_connection.cursor()
            sql = 'select hotel_unique_id from desiya_hotels'
            cursor.execute(sql)
            records = cursor.fetchall()
            hotelid_list = [record[0] for record in records]
            hotelid_list = ['00000002', '00000004', '00000005', '00000007', '00000010', '00000011', '00000012', '00000013', '00000014', '00000015']
            return hotelid_list
        except Exception as e:
            print("Error to connect db")


if __name__ == '__main__':
    process = CrawlerProcess(settings={
        'FEED_FORMAT': 'json',
        'FEED_URI': 'items.json',
        'CONCURRENT_REQUESTS': '1',
        'DOWNLOAD_DELAY':'5',
        'ITEM_PIPELINES':{
            'pipelines.MySQLStorePipeline': 1,
        }
    })

    process.crawl(YatrapiSpider)
    process.start()
Code example #57
import scrapy
from scrapy.crawler import CrawlerProcess
from scrapy.utils.project import get_project_settings

from corona_crawler.corona_crawler.spiders.corona import CoronaSpider

process = CrawlerProcess(get_project_settings())
process.crawl(CoronaSpider)
process.start()
Code example #58
from scrapy.crawler import CrawlerProcess
from scrapy.utils.project import get_project_settings
from main.spiders.main_spider import MainSpider

settings = get_project_settings()
settings['ITEM_PIPELINES'] = {'main.pipelines.JsonWriterPipeline': 1}

process = CrawlerProcess(settings)
process.crawl(MainSpider)
process.start()
Code example #59
File: runner.py Project: jdk-21/immoscout
import scrapy
from scrapy.crawler import CrawlerProcess
from scrapy.utils.project import get_project_settings
from immospider.spiders.immoscout import ImmoscoutSpider


process = CrawlerProcess(settings=get_project_settings())
#process.crawl(ImmoscoutSpider, url="https://www.immobilienscout24.de/Suche/S-T/Wohnung-Miete/Berlin/Berlin/-/2,50-/60,00-/EURO--1000,00")
process.crawl(ImmoscoutSpider, url="https://www.immobilienscout24.de/Suche/S-T/Wohnung-Kauf/Nordrhein-Westfalen/Dortmund/-/-/-/EURO-50000,00-150000,00?enteredFrom=result_list")
process.start()


# https://github.com/balzer82/immoscraper/blob/master/immoscraper.ipynb
# Input parameter for later
#b = 'Sachsen' # Bundesland
#s = 'Dresden' # Stadt
#k = 'Haus' # Wohnung oder Haus
#w = 'Kauf' # Miete oder Kauf
#url = 'http://www.immobilienscout24.de/Suche/S-T/P-%s/%s-%s/%s/%s?pagerReporting=true' % (page, k, w, b, s)
Code example #60
from scrapy.crawler import CrawlerProcess
from scrapy.utils.project import get_project_settings

import sys

process = CrawlerProcess( get_project_settings())

name = ["photos_spider"]

process.crawl('photos_spider')

process.start()