Example #1
    def execute_task(self, website: Website, busy: Value, post_id: str, comment_id: str):
        busy.value = 1
        if os.path.exists("data.json"):
            os.remove("data.json")
        print("Started crawling task")
        process = CrawlerProcess(get_project_settings())
        process.crawl("od_links", base_url=website.url)
        process.start()
        print("Done crawling")

        self.db.import_json("data.json", website)
        os.remove("data.json")
        print("Imported in SQLite3")

        if post_id:
            # Reply to post
            stats = self.db.get_website_stats(website.id)
            comment = self.reddit_bot.get_comment({"": stats}, website.id)
            print(comment)
            if "total_size" in stats and stats["total_size"] > 10000000:
                post = self.reddit_bot.reddit.submission(post_id)
                self.reddit_bot.reply(post, comment)
                pass
            else:
                self.reddit_bot.log_crawl(post_id)

        elif comment_id:
            # Reply to comment
            stats = self.db.get_website_stats(website.id)
            comment = self.reddit_bot.get_comment({"There you go!": stats}, website.id)
            print(comment)
            reddit_comment = self.reddit_bot.reddit.comment(comment_id)
            self.reddit_bot.reply(reddit_comment, comment)
        busy.value = 0
        print("Done crawling task")
Example #2
    def service_sis(self):
        process = CrawlerProcess({
            'USER_AGENT': 'Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1)'
        })

        process.crawl(worker.Worker)
        process.start()  # the script will block here until the crawling is finished
Example #3
File: t.py  Project: szqh97/test
def main():
    """Setups item signal and run the spider"""
    # set up signal to catch items scraped
    from scrapy import signals
    from scrapy.xlib.pydispatch import dispatcher
 
    def catch_item(sender, item, **kwargs):
        print "Got:", item
 
    dispatcher.connect(catch_item, signal=signals.item_passed)
 
    # shut off log
    from scrapy.conf import settings
    settings.overrides['LOG_ENABLED'] = False
 
    # set up crawler
    from scrapy.crawler import CrawlerProcess
 
    crawler = CrawlerProcess(settings)
 
    # schedule spider
 
    # start engine scrapy/twisted
    print "STARTING ENGINE"
    crawler.start()
    print "ENGINE STOPPED"
Example #4
def runSpiderProcess(spider_cls, *args, **kwargs):
    """
    Helper method that starts a spider with the given init arguments, waits for it to complete, and returns the
    items it yielded in a list.
    :param spider_cls: the spider class to run
    :param args: the indexed arguments to the spider
    :param kwargs: the keyword arguments to the spider
    :return: a list of items yielded by the spider
    """
    process = CrawlerProcess()
    process.crawl(spider_cls, *args, **kwargs)

    final_result = []

    def _nab_item(item):
        # FIXME: this silly dance of encoding and decoding is to prevent scrapy items from being returned to celery
        # FIXME: celery can't serialize them, so it throws a rather opaque error, but it's fine with lists and dicts
        final_result.append(json.loads(scrapy_encoder.encode(item)))

    for crawler in process.crawlers:
        crawler.signals.connect(_nab_item, item_scraped)

    process.start()
    process.stop()

    return final_result
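A hedged usage sketch for the helper above: the spider below is a stand-in defined only for illustration (the original project's spiders are not shown), and the call itself is left commented out because runSpiderProcess blocks until the reactor stops.

import scrapy

class QuotesSpider(scrapy.Spider):
    """Throwaway example spider; yields one dict per quote on the page."""
    name = "quotes_example"
    start_urls = ["http://quotes.toscrape.com/"]

    def parse(self, response):
        for text in response.css("span.text::text").getall():
            yield {"text": text}

# items = runSpiderProcess(QuotesSpider)  # -> list of plain dicts, safe to hand to celery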
Example #5
def main(argv):

	try:
		opts, args = getopt.getopt(argv, "ch:t:s:", ['title=', 'section='])
	except getopt.GetoptError:
		print 'Usage:\npython2.7 decc.py -h(help)\npython2.7 decc.py -c(crawl articles)\npython2.7 decc.py -s(search article by section) <section>\npython2.7 decc.py -t(search article by title) <title>'
		sys.exit(2)
	for opt, arg in opts:
		if opt == '-h':
			print 'Usage:\npython2.7 decc.py -h(help)\npython2.7 decc.py -c(crawl articles)\npython2.7 decc.py -s(search article by section) <section>\npython2.7 decc.py -t(search article by title) <title>'
			sys.exit()
		elif opt == '-c':
			# start crawling article here
			print "crawling"
			process = CrawlerProcess(get_project_settings())
			process.crawl(BBCArticleSpider)
			process.start()
		elif opt in  ('-t', '--title'):
			print "search by title"
			# start searching article by title
			results = BBCArticleItem.fetch_by_title(arg)
			for result in results:
				print result
		elif opt in ('-s', '--section'):
			print "search by section"
			# start searching article by section
			results = BBCArticleItem.fetch_by_section(arg)
			for result in results:
				print result
Example #6
def crawl(spiders_classes, connector, debug=False, spider_error_callback=stdout_error_callback, scrapy_settings=None):
    """
    Launch crawl job for JobSpider class
    :param scrapy_settings: dict of setting merged with CrawlerProcess default settings
    :param debug: (bool) Activate or disable debug
    :param spider_error_callback: callback foir spider errors (see http://doc.scrapy.org/en/latest/topics/signals.html#spider-error)
    :param connector: Connector instance
    :param spiders_classes: JobSpider class list
    :return: spider instance
    """
    if debug:
        dispatcher.connect(spider_error_callback, signals.spider_error)

    settings = {
        'ITEM_PIPELINES': {
            'pyjobs_crawlers.pipelines.RecordJobPipeline': 1,
        },
        'connector': connector,
        'LOG_ENABLED': False,
        'DOWNLOAD_DELAY': 1 if not debug else 0,
    }
    if scrapy_settings:
        settings.update(scrapy_settings)

    process = CrawlerProcess(settings)

    for spider_class in spiders_classes:
        process.crawl(spider_class, debug=debug)

    spiders = []
    for crawler in list(process.crawlers):
        spiders.append(crawler.spider)
    process.start()

    return spiders
Example #7
def main():
	"""Rutina principal para la ejecución del Spider"""
	# set up signal to catch items scraped
	from scrapy import signals
	from scrapy.xlib.pydispatch import dispatcher

	def catch_item(sender, item, **kwargs):
		print "Item Extraido:", item
	dispatcher.connect(catch_item, signal=signals.item_passed)

	from scrapy.conf import settings
	settings.overrides['LOG_ENABLED'] = False

	# setup crawler
	from scrapy.crawler import CrawlerProcess

	crawler = CrawlerProcess(settings)
	crawler.install()
	crawler.configure()

	# define the spider for the crawler
	crawler.crawl(BloggerSpider())

	# start scrapy
	print "STARTING ENGINE"
	crawler.start()
	print "ENGINE STOPPED"
Example #8
class CrawlerScript():

    def __init__(self):
        settings = get_project_settings()
        settings.set('LOG_ENABLED', False, priority='cmdline')
        #settings.overrides['LOG_ENABLED'] = False
        self.crawler = CrawlerProcess(settings)
        self.items = []
        SignalManager(dispatcher.Any).connect(self._item_passed, signal=signals.item_scraped)

    def _item_passed(self,item,response,spider):
        self.items.append(item)

    def _crawl(self, q, queue):
        self.crawler.crawl(BingSpider, q=q)
        self.crawler.start()
        self.crawler.stop()
        queue.put(self.items)

    def crawl(self, q):
        queue = Queue()
        p = Process(target=self._crawl, args=[q, queue])
        p.start()
        p.join()
        return queue.get(True)
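The point of the Process/Queue dance above is that Twisted's reactor cannot be restarted, so every query gets a throwaway child process and the scraped items travel back through the queue. A hedged usage sketch (the query string is arbitrary):

if __name__ == '__main__':
    scraper = CrawlerScript()
    items = scraper.crawl('scrapy python')  # blocks until the child process finishes
    print(len(items), 'items collected from BingSpider')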
Example #9
def get_scraped_sites_data():
    """Returns output for venues which need to be scraped."""
    class RefDict(dict):
        """A dictionary which returns a reference to itself when deepcopied."""
        def __deepcopy__(self, memo):
            return self

    # Hack: we pass a dictionary which can't be deep-copied into the settings
    # so as to _return_ the scraper output. As far as I can tell, this is the
    # only way to return the scraper output to the script itself.
    output = RefDict()

    settings = Settings({
        'LOG_ENABLED': False,
        'ITEM_PIPELINES': {
            'mgrok.pipelines.JsonWriterPipeline': 1
            },
        'PIPELINE_OUTPUT': output,
        'USER_AGENT': 'Chrome/41.0.2228.0'
        })

    crawler_process = CrawlerProcess(settings)
    for spider in SCRAPY_SPIDERS:
        crawler_process.crawl(spider)

    crawler_process.start()

    return output
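The pipeline side of the RefDict trick is not shown above. A minimal sketch of what mgrok.pipelines.JsonWriterPipeline might look like, assuming it simply appends each item into the PIPELINE_OUTPUT dictionary; this is an assumption, not the project's actual code.

class JsonWriterPipeline(object):
    def open_spider(self, spider):
        # Because RefDict survives the settings deep-copy, this is the very
        # same dict object the calling script holds on to.
        self.output = spider.crawler.settings.get('PIPELINE_OUTPUT')

    def process_item(self, item, spider):
        self.output.setdefault(spider.name, []).append(dict(item))
        return item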
Example #10
def run(urls, city):
    process = CrawlerProcess()
    spiders = [make_spider(artist, url, city) for artist, url in urls]
    for spider_cls in spiders:
        process.crawl(spider_cls)
    # the script will block here until the crawling is finished
    process.start()
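make_spider is not shown above; a common way to build one spider class per (artist, url, city) tuple is to call type() on scrapy.Spider. The sketch below is an assumption about how such a factory could look, not the original implementation.

import scrapy

def make_spider(artist, url, city):
    """Build a one-off Spider subclass for a single artist/city page."""
    return type(
        '{}Spider'.format(artist.title().replace(' ', '')),
        (scrapy.Spider,),
        {
            'name': '{}_{}'.format(artist, city),
            'start_urls': [url],
            # real parsing logic omitted; yield nothing in this sketch
            'parse': lambda self, response: iter(()),
        },
    )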
Example #11
class CrawlerScript():

    def __init__(self):
        self.crawler = CrawlerProcess(settings)
        if not hasattr(project, 'crawler'):
            self.crawler.install()
        self.crawler.configure()
        self.items = []
        dispatcher.connect(self._item_passed, signals.item_passed)

    def _item_passed(self, item):
        self.items.append(item)

    def _crawl(self, queue, spider_name):
        spider = self.crawler.spiders.create(spider_name)
        if spider:
            self.crawler.queue.append_spider(spider)
        self.crawler.start()
        self.crawler.stop()
        queue.put(self.items)

    def crawl(self, spider):
        queue = Queue()
        p = Process(target=self._crawl, args=(queue, spider,))
        p.start()
        p.join()
        return queue.get(True)
    def handle(self, *args, **options):
        # It would be better to pass this in as a parameter to PayoutSpider
        global start_date
        start_date = datetime.datetime(2015, 1, 1, tzinfo=pytz.UTC)

        delete = options.get('delete')
        delete_all = options.get('delete_all')
        retrieve_all = options.get('retrieve_all')

        previous_payout = None
        previous_payouts = codementor_models.Payout.objects.all().order_by('-date')
        if delete_all or (delete and previous_payouts.count() == 0):
            codementor_models.Review.objects.all().delete()
            codementor_models.Session.objects.all().delete()
            codementor_models.Payout.objects.all().delete()
            codementor_models.Payment.objects.all().delete()
        elif delete:
            previous_payout = previous_payouts[0]
            codementor_models.Review.objects.filter(date__gt=start_date).delete()
            codementor_models.Session.objects.filter(started_at__gt=start_date).delete()
            previous_payout.delete()
            codementor_models.Payment.objects.filter(payout__isnull=True).delete()

        if not retrieve_all and previous_payout:
            start_date = previous_payout.date

        process = CrawlerProcess({
            'USER_AGENT': 'Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1)'
        })

        process.crawl(PayoutSpider)
        process.start()
def run_spider(spider, settings):
    """Run a spider with given settings"""
    from scrapy import signals
    from scrapy.xlib.pydispatch import dispatcher
    from scrapy.settings import CrawlerSettings
      
    def catch_item(sender, item, **kwargs):
        #log.msg("Got:" + str(item))
        pass
       
    dispatcher.connect(catch_item, signal=signals.item_passed)

    """clean storage"""
    scraperwiki.sqlite.execute("drop table if exists "+spider.name)
    scraperwiki.sqlite.commit()


    from scrapy.crawler import CrawlerProcess

    settings = CrawlerSettings(values=settings)

    crawler = CrawlerProcess(settings)
    crawler.install()
    crawler.configure()
    crawler.crawl(spider)

    #log.start(loglevel='DEBUG')

    crawler.start()
    def run(self):
        settings = get_project_settings()
        process = CrawlerProcess(settings)

        process.crawl('stackoverflow')
        process.start()
Example #15
 def get(self):
     while True:
         process = CrawlerProcess(get_project_settings())
         process.crawl('iqiyi')
         process.start()
         time.sleep(3000)
     self.finish()
def spiderCrawl(bandname):
   createLink(bandname)
   settings = get_project_settings()
   settings.set('USER_AGENT','Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1)')
   process = CrawlerProcess(settings)
   process.crawl(MySpider)
   process.start()
def startSpiderTest(group_type,spider_type,spider_group_name,spider_name):
    # use Scrapy's internal API
    settings = get_project_settings()
    # instantiate a crawler process
    crawlerProcess = CrawlerProcess(settings)

    # create a crawler; a single crawler process can run multiple crawls
    crawler = crawlerProcess.create_crawler(spider_name)

    # wire up the crawler's signals: when the spider emits a signal, the matching handler is called
    crawler.signals.connect(spiderSignal.startSingnal, signals.spider_opened)
    crawler.signals.connect(spiderSignal.errorSingnal, signals.spider_error)
    crawler.signals.connect(spiderSignal.stopSingnal, signals.spider_closed)

    # look up the spider class
    spiderConf = Spider_Dict[group_type][spider_type]
    spiderArgs = spiderConf[1].copy()
    spiderArgs["name"] = spider_name
    spiderArgs["redis_key"] = spider_name
    spiderArgs["spider_type"] = spider_type
    spiderArgs["spider_group_name"] = spider_group_name
    spiderArgs["task_id"] = "-1"

    spider = spiderConf[0](**spiderArgs)

    # configure the crawler and assign it the spider
    crawler.configure()
    crawler.crawl(spider)

    # start the crawler
    crawlerProcess.start()
    crawlerProcess.stop()
Example #18
class MySpiderProcess1(scrapy.Spider):
    def __init__(self, name, urls):
        self.name = name
        self.start_urls = urls
        scrapy.Spider.__init__(self)

    def parse(self, response):
        print('parse response')

    def _crawl(self):
        settings = Settings()
        settings.set('ITEM_PIPELINES', {
            'app.pipelines.JsonWriterPipeline': 300
        })
        self.process = CrawlerProcess(settings)
        self.process.crawl(self, self.name, self.start_urls)
        self.process.start()
        # self.process.stop()
        # self.process.join()

    def start(self):
        p = Process(target=self._crawl)
        p.start()
        p.join()

    #
    # def start(self):
    #     self._crawl()

    def stop(self):
        self.process.stop()
 def handle(self, *args, **options):
     from scrapy import signals
     from scrapy.xlib.pydispatch import dispatcher
     
     def catch_item(sender, item, **kwargs):
         print "Got:", item
         
     dispatcher.connect(catch_item, signal=signals.item_passed)
     
     from scrapy.conf import settings
     settings.overrides['LOG_ENABLED'] = True
     
     from scrapy.crawler import CrawlerProcess
     
     crawler = CrawlerProcess(settings)
     crawler.install()
     crawler.configure()
     
     from alescspider.spiders import *
     spiders = [deputado_spider.DeputadoSpider()]
     #spiders = [presenca_spider.PresencaSpider(), votos_spider.VotosSpider(), deputado_spider.DeputadoSpider()]
     for spider in spiders:
         crawler.queue.append_spider(spider)
     
     print "STARTING ENGINE"
     crawler.start()
     print "ENGINE STOPPED"
     
def main():
    """Index alexa demographics
    """

    engine = db_connect()
    Session = sessionmaker(bind=engine)
    session = Session()

    settings = get_project_settings()
    settings.set('ITEM_PIPELINES',
                 {'demographic_scraper.demographic_scraper.pipelines.WebsiteDemographicPipeline': 300})
    settings.set('EXTENSIONS',
                 {'scrapy.telnet.TelnetConsole': None,})


    process = CrawlerProcess(settings)
    for website in session.query(WebsitesContent).all():
        demographic = list(session.query(Websites).filter_by(link=website.link))
        if len(demographic) is 0:
            url = website.link
            print website.link
            AlexaSpider.name = url
            process.crawl(AlexaSpider, url=url, db_session=session)
    process.start()
    process.stop()

    session.close()
def scrape(spider):
    with transaction.atomic(), reversion.create_revision():
        process = CrawlerProcess(DEFAULT_CRAWLER_OPTIONS)
        process.crawl(spider)
        # the script will block here until the crawling is finished
        process.start()
    return
Example #22
    def handle(self, *args, **options):
        setting = {
            'USER_AGENT': options['user_agent'],
            'DOWNLOAD_DELAY': options['download_delay'],
            'LOG_FILE': settings.SCRAPY_LOG_FILE,
            'LOG_LEVEL': settings.SCRAPY_LOG_LEVEL,
        }

        if options['proxy_list']:
            try:
                f = open(options['proxy_list'])
            except IOError as e:
                raise CommandError('cannot open proxy list file for read')

            # Retry many times since proxies often fail
            setting['RETRY_TIMES'] = 10
            # Retry on most error codes since proxies fail for different reasons
            setting['RETRY_HTTP_CODES'] = [500, 503, 504, 400, 403, 404, 408]
            setting['DOWNLOADER_MIDDLEWARES'] = {
                'scrapy.downloadermiddlewares.retry.RetryMiddleware': 90,
                'spider.randomproxy.RandomProxy': 100,
                'scrapy.downloadermiddlewares.httpproxy.HttpProxyMiddleware': 110,
            }
            setting['PROXY_LIST'] = options['proxy_list']

        process = CrawlerProcess(setting)

        process.crawl(BaiduSpider)
        process.start()
Example #23
	def __init__(self, titlesfile = None, platform = None, region = None):

		# set default encoding to utf8 for parsing and logging
		# utf-8 characters in console and files
		#
		reload(sys)
		sys.setdefaultencoding('utf8')
        
		configure_logging(install_root_handler=False)
		logging.basicConfig(
			filename='export.log',
			filemode = 'a',
			format='%(levelname)s: %(message)s',
			level=logging.INFO
		)
                				
		# identify platform
		#
		self.platform = platform
		if self.platform is None:
			logging.error('No platform found! Pass it as an argument.')
			return
		else:			
			platformId = platforms.getId(self.platform)
			if platformId is None:
				logging.error('Platform ' + self.platform + ' not supported.')
				return
						
		self.titlesfile = titlesfile
		self.region = region		
		if self.region is None:
			self.region = "Worldwide"
		
		if titlesfile:		
		
			titles = []
			urls = []
			
			with open( self.titlesfile ) as f:
				titles = f.read().splitlines()
				
			for title in titles:
				logging.debug('Submitting title:' + title )
				urls.append(
					'http://mobygames.com/search/quick' +
					'?q=' + title +
					'&p=' + platformId +
					'&search=Go'
					'&sFilter=1'
					'&sG=on'
					'&search_title=' + urllib.quote( title ) + 
					'&search_platform=' + urllib.quote(self.platform) +
					'&search_region=' + urllib.quote(self.region)
				)
				
			process = CrawlerProcess(get_project_settings())
			process.crawl(MobygamesSpider, start_urls=urls)
			process.start()									
		else:
			logging.warning('No file.')
Example #24
def crawl(spiders_classes, connector, debug=False, spider_error_callback=stdout_error_callback):
    """
    Launch crawl job for JobSpider class
    :param debug: (bool) Activate or disable debug
    :param spider_error_callback: callback foir spider errors (see http://doc.scrapy.org/en/latest/topics/signals.html#spider-error)
    :param connector: Connector instance
    :param spiders_classes: JobSpider class list
    :return: spider instance
    """
    if debug:
        dispatcher.connect(spider_error_callback, signals.spider_error)

    process = CrawlerProcess({
        'ITEM_PIPELINES': {
            'pyjobs_crawlers.pipelines.RecordJobPipeline': 1,
        },
        'connector': connector,
        'LOG_ENABLED': False
    })

    for spider_class in spiders_classes:
        process.crawl(spider_class)

    spiders = []
    for crawler in list(process.crawlers):
        spiders.append(crawler.spider)
    process.start()

    return spiders
Example #25
def _crawl(path=None):
     crawl = CrawlerProcess({
         'USER_AGENT': 'Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1)'
     })
     crawl.crawl(ProvinceSpider)
     crawl.start()
     crawl.stop()
Example #26
    def Test_Scapy(self):
        spider = FtpSpider()

        process = CrawlerProcess({"USER_AGENT": "Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1)"})

        process.crawl(spider)
        process.start()
Example #27
def scrapeando():
    from scrapy import signals
    from scrapy.xlib.pydispatch import dispatcher

    def catch_item(sender, item, **kwargs):
        """Rellenamos la BD"""
        for i in enumerate(item.items()):
            x = i[0]
            query = "INSERT INTO book (Nombre ,Autor, Editorial ,Fecha, Precio, Link) VALUES ("+decodifica(item['Nombre'][x])+","+decodifica(item['Autor'][x])+","+decodifica(item['Editorial'][x])+","+decodifica(item['Fecha'][x])+","+decodifica(item['Precio'][x])+","+decodifica("http://www.casadellibro.com"+item['Link'][x])+");"
            db.micursor.execute(query)
            db.conexion.commit()
        print item

    dispatcher.connect(catch_item, signal=signals.item_passed)

    from scrapy.conf import settings
    settings.overrides['LOG_ENABLED'] = False

    from scrapy.crawler import CrawlerProcess

    crawler = CrawlerProcess(settings)
    crawler.install()
    crawler.configure()
    book = BookSpider()
    book.busqueda=unicode(search.getbusqueda())
    crawler.crawl(book)
    print "Start scraping to la Casa del Libro"
    crawler.start()
    print "End scraping to la Casa del Libro"
    crawler.stop()
Example #28
def crawl(ctx, spiders, stats):
    """
    Crawl one or many or all pages.

    What spider(s) to run is determined in the following order:

      1. Spider(s) given as argument(s)

      2. Spider(s) specified in the configuration file

    Note that if a spider is given as an argument, the spiders in the
    configuration file are ignored. All available spiders will be used to
    crawl if no arguments are given and no spiders are configured.
    """
    settings = ctx.obj["settings"]
    if stats:
        settings.set("STATS_CLASS", "scrapy.statscollectors.MemoryStatsCollector")

    # Start a new crawler process.
    process = CrawlerProcess(settings)
    spiders = spiders_to_crawl(process, spiders)
    if not spiders:
        logger.error("Please specify what spiders you want to run!")
    else:
        for spider in spiders:
            logger.info("Starting crawl of {} ...".format(spider))
            process.crawl(spider)

    process.start()

    if settings.getbool("HTTPCACHE_ENABLED"):
        run_cleanup_cache(settings)
Example #29
def ScrapeSite():
    db = 'crunchbase_startups'
    sitedomain = raw_input("Enter site domain: ") # get user input
    sitedomain = parse_base_url(sitedomain) # clean url
    
    sql = 'SELECT text FROM {} WHERE siteurl = %s'.format(db)
    
    cur.execute(sql, sitedomain)
    sitetext = cur.fetch()
    
    if sitetext != '': # what does an empty ping return?
        print 'Site already scraped.'
        return sitetext
    
    process = CrawlerProcess({
        'USER_AGENT': 'Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1)',
        'ITEM_PIPELINES': {'pipelines.UserInputPipeline': 100},
        'DEPTH_LIMIT': 2,
        'DOWNLOAD_HANDLERS': {'s3': None,}
        ,'LOG_LEVEL': 'INFO'
    })
    
    process.crawl(SoloSpider, domain = sitedomain)
    process.start()
    
    # presumably finished here - pull newly loaded sitetext for domain
    
    cur.execute(sql, sitedomain)
    return cur.fetch()
def main():
    """Setups item signal and run the spider"""
    # set up signal to catch items scraped
    from scrapy import signals
    from scrapy.xlib.pydispatch import dispatcher

    def catch_item(sender, item, **kwargs):
        print "Got:", item

    options = parse_args()

    dispatcher.connect(catch_item, signal=signals.item_passed)

    # shut off log
    from scrapy.conf import settings
    settings.overrides['LOG_ENABLED'] = True
    settings.overrides['DEPTH_LIMIT'] = 2


    # set up crawler
    from scrapy.crawler import CrawlerProcess

    crawler = CrawlerProcess(settings)
    crawler.install()
    crawler.configure()

    # schedule spider
    spider = MySpider(input=options.input, output=options.output)
    crawler.queue.append_spider(spider)

    # start engine scrapy/twisted
    print "STARTING ENGINE"
    crawler.start()
    print "ENGINE STOPPED"
Example #31
def execute(tags_urls):
    process = CrawlerProcess({'LOG_LEVEL': LOG_LEVEL})

    process.crawl(TagSpider, tags=';'.join(tags_urls))
    process.start()
    process.stop()
Example #32
import scrapy
import sys, getopt
from scrapy.crawler import CrawlerProcess
from ypscraper.spiders.yellowpages import YellowpagesSpider
from scrapy.utils.project import get_project_settings

process = CrawlerProcess(get_project_settings())

process.crawl('yellowpages', max_listings='100', infile='searches.json')
process.start()
Example #33
#process.crawl(fun.FunSpider())
#process.start(stop_after_crawl=False)

process1 = CrawlerProcess({
    'USER_AGENT':
    'Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1)',
    #'DOWNLOAD_DELAY': 1,
    #'RETRY_HTTP_CODES': {500, 502, 503, 504, 522, 524, 408, 456},
    #'CONCURRENT_REQUESTS_PER_IP': 32,
    #'CONCURRENT_REQUESTS_PER_DOMAIN':32,
    #'COOKIES_ENABLED': False
    #'COOKIES_ENABLED': False
})
from stock import ticker
ticker.TickerTodayPriceSpider.CONCURRENT_REQUESTS_PER_IP = 4
process1.crawl(ticker.TickerTodayPriceSpider())

process1.crawl(ticker.TickerTodayPriceSpider1())

process1.crawl(ticker.TickerTodayPriceSpider2())

process1.crawl(ticker.TickerTodayPriceSpider3())

from stock import full_price_update
process1.crawl(full_price_update.FullPriceSpider())

from stock import min_price_update
process1.crawl(min_price_update.MinPriceDailySpider())

process1.start()
Example #34
    def test_37(self):
        # Basic, useful data
        local_dir = self.sitemaps_spider.local_dir
        website_dir = self.sitemaps_spider.website_folder

        # Disable useless messages from the engine.
        # To be fair, they can be really useful in usual development context,
        # but here they fill up our tests output.
        # Based on: https://stackoverflow.com/a/33204694
        logging.getLogger('scrapy').setLevel(logging.WARNING)
        logging.getLogger('scrapy').propagate = False

        # Single element sitemap
        crawler_process = CrawlerProcess()
        crawler1 = CrawlerWithResults(self.sitemaps_spider)
        crawler_process.crawl(crawler1)
        crawler1.spider.sitemap_urls = [
            local_dir + website_dir + "/sitemap1.xml"
        ]
        crawler1.spider.name += "1"
        # Multiple element sitemap
        crawler2 = CrawlerWithResults(self.sitemaps_spider)
        crawler_process.crawl(crawler2)
        crawler2.spider.sitemap_urls = [
            local_dir + website_dir + "/sitemap2.xml"
        ]
        crawler2.spider.name += "2"
        # Multiple sitemaps within a sitemap
        crawler3 = CrawlerWithResults(sitemaps.LocalSitemapsSpider)
        crawler_process.crawl(crawler3)
        crawler3.spider.sitemap_urls = [
            local_dir + website_dir + "/sitemap3.xml"
        ]
        crawler3.spider.name += "3"
        # Multiple sitemaps within a sitemap
        crawler4 = CrawlerWithResults(sitemaps.LocalSitemapsSpider)
        crawler_process.crawl(crawler4)
        crawler4.spider.sitemap_urls = [
            local_dir + website_dir + "/sitemap4.xml"
        ]
        crawler4.spider.name += "4"

        # We can't run multiple processes in one script, due to the Twisted Reactor.
        # This is why we test everything in a single test unit.
        # Kind of bad, but it is the only way.
        crawler_process.start()

        # Check that all the tests are good
        # Single page sitemap
        self.assertEqual(crawler1.items, [{'1': '1'}])
        # Multiple page sitemap
        self.assertEqual(
            sorted(
                (key, item[key]) for item in crawler2.items for key in item),
            [(str(x), str(x)) for x in range(2, 7)])
        # Sitemaps within a sitemap
        self.assertEqual(
            sorted(
                (key, item[key]) for item in crawler3.items for key in item),
            [(str(x), str(x)) for x in range(1, 7)])
        # Compressed sitemaps within a sitemap
        self.assertEqual(
            sorted(
                (key, item[key]) for item in crawler4.items for key in item),
            [(str(x), str(x)) for x in range(1, 7)])
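CrawlerWithResults is referenced but not defined in the excerpt above. A hedged sketch of one way to implement it, as a Crawler subclass that records every scraped item on itself through the item_scraped signal; an assumption, not the project's actual class.

from scrapy import signals
from scrapy.crawler import Crawler

class CrawlerWithResults(Crawler):
    """Crawler that keeps the items it scrapes in self.items."""

    def __init__(self, *args, **kwargs):
        super(CrawlerWithResults, self).__init__(*args, **kwargs)
        self.items = []
        self.signals.connect(self._collect, signal=signals.item_scraped)

    def _collect(self, item, response, spider):
        self.items.append(item)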
def crawl_article_pro():
    process = CrawlerProcess(get_project_settings())
    process.crawl(ArticleSpider)
    process.start()
Example #36
def craw(repositories):
	process = CrawlerProcess(get_project_settings())
	process.crawl('repositories', repositories=repositories)
	process.start()
Example #37
#)
#configure_logging({'LOG_STDOUT': True})


def parse_args():
    parser = argparse.ArgumentParser()
    parser.add_argument('--max_page', default=1, type=int)
    parser.add_argument('--cat_id', default='explore', type=str)
    parser.add_argument('--cat_name', default='话题精选', type=str)
    parser.add_argument('--start_url', default='/group/explore', type=str)
    parser.add_argument('--url_keywords', default='topic', type=str)
    parser.add_argument('--sleep', default=3, type=int)
    parser.add_argument('--only_image', default=0, type=int)
    args = parser.parse_args()
    settings = get_project_settings()
    settings.set('MAX_PAGE', args.max_page, 'project')
    settings.set('CAT_ID', args.cat_id, 'project')
    settings.set('CAT_NAME', args.cat_name, 'project')
    settings.set('START_URL', args.start_url, 'project')
    settings.set('URL_KEYWORDS', args.url_keywords, 'project')
    settings.set('DOWNLOAD_DELAY', args.sleep, 'project')
    settings.set('ONLY_IMAGE', args.only_image, 'project')
    return settings


if __name__ == "__main__":
    settings = parse_args()
    crawler_process = CrawlerProcess(settings)
    crawler_process.crawl(DoubanScrapy)
    crawler_process.start()
Example #38
    """Small spider for downloading large files from gdrive"""
    name = 'Google Drive Large File Downloader'


    def __init__(self, url, file_name):
        self.file_name = file_name
        self.start_urls = [url]

    def parse(self, response):
        """Parses Google's warning page."""
        download_url = 'https://drive.google.com' + \
            response.xpath('//div[@class="uc-main"]'
                           + '/div[@id="uc-text"]/a/@href').extract()[0]
        yield Request(url=download_url, callback=self.save_file,
                      meta={'download_maxsize' : 0, 'download_timeout' : 1200})

    def save_file(self, response):
        """Saves downloaded file."""
        with open(self.file_name, 'wb') as large_file:
            large_file.write(response.body)


if __name__ == "__main__":
    # Reduce Scrapy logger verbosity.
    logging.disable(logging.WARNING)
    spiderproc = CrawlerProcess({
        'USER_AGENT': 'Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1)'
    })
    spiderproc.crawl(GoogleDriveSpider, url=sys.argv[1], file_name=sys.argv[2])
    spiderproc.start()
                '------------------------------ split ------------------------------'
            )
            import pprint
            pprint.pprint(d)
            yield d


# Fallback configuration so the spider can also run as a standalone script; when launched from the project, the code below has no effect
if __name__ == '__main__':
    import os, time
    from scrapy.crawler import CrawlerProcess
    timestamp = time.strftime("%Y%m%d_%H%M%S", time.localtime())  # YYYYMMDD_HHMMSS
    filename = 'v{}.json'.format(timestamp)  # output file name (takes effect if the 'FEED_URI' setting is uncommented)
    jobdir = 'JOBDIR/kiCYjoVAmJ'  # job/queue directory (takes effect if the 'JOBDIR' setting is uncommented)

    p = CrawlerProcess({
        'TELNETCONSOLE_ENABLED': False,  # hardly anyone uses this feature; disabling it speeds up crawler startup
        'MEDIA_ALLOW_REDIRECTS': True,  # allow redirects for media download URLs; recommended whenever images are downloaded
        'LOG_LEVEL': 'INFO',  # DEBUG , INFO , WARNING , ERROR , CRITICAL
        # 'JOBDIR':                   jobdir,     # uncomment to enable resumable (pause/continue) crawling
        # stores the request queue, dedup fingerprints and job state (simply put, a folder)
        # 'FEED_URI':                 filename,   # write scraped data to a file
        # 'FEED_EXPORT_ENCODING':     'utf-8',    # roughly equivalent to ensure_ascii=False
        # 'FEED_FORMAT':              'json',     # output format; defaults to jsonlines when not set,
        # supported formats: json, jsonlines, csv, xml, pickle, marshal
        # 'DOWNLOAD_TIMEOUT':         8,          # global request timeout, default 180; can also be set per request via meta (download_timeout)
        # 'DOWNLOAD_DELAY':           1,          # global download delay; far more intuitive than the other throttling settings
    })
    p.crawl(VSpider)
    p.start()
Example #40
def run_scraper():
    process = CrawlerProcess(
        {'USER_AGENT': 'Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1)'})
    process.crawl(HNScrapy)
    process.start()
Example #41
import scrapy
import sys
sys.path.append(
    '..')  #TODO add example directory to sys.path without this command
from scrapy.crawler import CrawlerProcess
from scrapy.utils.project import get_project_settings
from Diversity import DiversityCrawler
from Rent import RentCrawler
from Transport import TransportCrawler

if __name__ == '__main__':
    process = CrawlerProcess(get_project_settings())
    process.crawl(RentCrawler)
    process.crawl(DiversityCrawler)
    process.crawl(TransportCrawler)
    process.start(stop_after_crawl=True)
Example #42
                ")  .price::text").extract()
            fjson["relative_img_link"] = response.xpath(
                "//div[@class ='image-gradient']/img/@src").extract()[t]
            fjson["abs"] = base_url + fjson["relative_img_link"]
            #fjson['image_urls'] = [url_join_imgz(base_url,relative_img_link)] for t in relative_img_link ]
            #fjson["relative_img_link"] = response.xpath("//div[@class ='image-gradient']/img/@src").extract()[t]
            fjson["tags"] = response.css(".CampaignPackages-items:nth-child(" +
                                         str(t) +
                                         ")  .tag-names::text").extract()
            fjson["img_link2"] = response.css(
                ".CampaignPackages-items:nth-child(" + str(t) +
                ") .lazyloaded").extract()

            print(fjson)
            print(
                "********************************************************************"
            )
            print(t)

            with open(base_path + 'data.csv', "a") as fo:
                fo.write("\n" + str(fjson))
                fo.flush()


process = CrawlerProcess({
    'USER_AGENT':
    'Mozilla/5.0 (Linux; Android 8.0.0; SM-G960F Build/R16NW) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/62.0.3202.84 Mobile Safari/537.36'
})
process.crawl(SatsaSpider)
process.start()  # the script will block here until the crawling is finished
Example #43
from scrapy.crawler import CrawlerProcess
from scrapy.utils.project import get_project_settings

settings = get_project_settings()

crawler = CrawlerProcess(settings)
crawler.crawl('ch3.18-email')  # spider name
crawler.start()




Example #44
 def start_spider(self):
     p=CrawlerProcess(settings=self.settings)
     p.crawl(JudgementSpider)
     p.start()
Example #45
def main():
    process = CrawlerProcess(get_project_settings())
    process.crawl('PTTCrawler',
                  urls_txt_path='./target_urls.txt',
                  output_path='test.json')
    process.start()
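When a spider is addressed by name like 'PTTCrawler', the extra keyword arguments are passed on to the spider's constructor. A hedged sketch of how that spider might consume them; the attribute handling below is an assumption, not the real spider.

import scrapy

class PTTCrawler(scrapy.Spider):
    name = 'PTTCrawler'

    def __init__(self, urls_txt_path=None, output_path=None, *args, **kwargs):
        super(PTTCrawler, self).__init__(*args, **kwargs)
        # one target URL per line in the text file
        with open(urls_txt_path) as f:
            self.start_urls = [line.strip() for line in f if line.strip()]
        self.output_path = output_path

    def parse(self, response):
        # real parsing logic lives in the project; omitted here
        pass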
Example #46
class ScannerApp:
    """A scanner application which can be run."""
    def __init__(self):
        """
        Initialize the scanner application.
        Takes input, argv[1], which is directly related to the scan job id in the database.
        Updates the scan status and sets the pid.
        """
        self.scan_id = sys.argv[1]

        # Get scan object from DB
        self.scan_object = Scan.objects.get(pk=self.scan_id)
        self.scan_object.set_scan_status_start()
        self.scanner = Scanner(self.scan_id)

    def run(self):
        """Run the scanner, blocking until finished."""
        settings = get_project_settings()

        self.crawler_process = CrawlerProcess(settings)

        if hasattr(self.scan_object, 'webscan'):
            self.start_webscan_crawlers()
        else:
            self.start_filescan_crawlers()

        # Update scan status
        self.scan_object.set_scan_status_done()

    def start_filescan_crawlers(self):
        self.sitemap_spider = None
        self.scanner_spider = self.setup_scanner_spider()
        self.start_crawlers()

    def start_webscan_crawlers(self):
        # Don't sitemap scan when running over RPC or if no sitemap is set on scan
        if not self.scan_object.scanner.process_urls:
            if len(self.scanner.get_sitemap_urls()) is not 0\
                    or len(self.scanner.get_uploaded_sitemap_urls()) is not 0:
                self.sitemap_spider = self.setup_sitemap_spider()
            else:
                self.sitemap_spider = None
        else:
            self.sitemap_spider = None

        self.scanner_spider = self.setup_scanner_spider()

        self.start_crawlers()
        if (self.scan_object.webscan.do_link_check
                and self.scan_object.webscan.do_external_link_check):
            # Do external link check
            self.external_link_check(self.scanner_spider.external_urls)

    def start_crawlers(self):
        # Run the crawlers and block
        logging.info('Starting crawler process.')
        self.crawler_process.start()
        logging.info('Crawler process started.')

    def handle_killed(self):
        """Handle being killed by updating the scan status."""
        # self.scan_object = Scan.objects.get(pk=self.scan_id)
        self.scan_object.set_scan_status_failed()
        self.scan.logging_occurrence("SCANNER FAILED: Killed")
        logging.error("Killed")

    def setup_sitemap_spider(self):
        """Setup the sitemap spider."""
        crawler = self.crawler_process.create_crawler(SitemapURLGathererSpider)
        self.crawler_process.crawl(
            crawler,
            scanner=self.scanner,
            runner=self,
            sitemap_urls=self.scanner.get_sitemap_urls(),
            uploaded_sitemap_urls=self.scanner.get_uploaded_sitemap_urls(),
            sitemap_alternate_links=True)
        return crawler.spider

    def setup_scanner_spider(self):
        """Setup the scanner spider."""
        crawler = self.crawler_process.create_crawler(ScannerSpider)
        crawler.signals.connect(self.handle_closed,
                                signal=signals.spider_closed)
        crawler.signals.connect(self.handle_error, signal=signals.spider_error)
        crawler.signals.connect(self.handle_idle, signal=signals.spider_idle)
        self.crawler_process.crawl(crawler, scanner=self.scanner, runner=self)
        return crawler.spider

    def get_start_urls_from_sitemap(self):
        """Return the URLs found by the sitemap spider."""
        if self.sitemap_spider is not None:
            logging.debug('Sitemap spider found')
            return self.sitemap_spider.get_urls()
        else:
            return []

    def external_link_check(self, external_urls):
        """Perform external link checking."""
        logging.info("Link checking %d external URLs..." % len(external_urls))

        for url in external_urls:
            url_parse = urlparse(url)
            if url_parse.scheme not in ("http", "https"):
                # We don't want to allow external URL checking of other
                # schemes (file:// for example)
                continue

            logging.info("Checking external URL %s" % url)

            result = linkchecker.check_url(url)
            if result is not None:
                broken_url = Url(url=url,
                                 scan=self.scan_object.webscan,
                                 status_code=result["status_code"],
                                 status_message=result["status_message"])
                broken_url.save()
                self.scanner_spider.associate_url_referrers(broken_url)

    def handle_closed(self, spider, reason):
        """Handle the spider being finished."""
        # TODO: Check reason for if it was finished, cancelled, or shutdown
        logging.debug('Spider is closing. Reason {0}'.format(reason))
        self.store_stats()
        reactor.stop()

    def store_stats(self):
        """Stores scrapy scanning stats when scan is completed."""
        logging.info('Stats: {0}'.format(
            self.scanner_spider.crawler.stats.get_stats()))

        try:
            statistics, created = Statistic.objects.get_or_create(
                scan=self.scanner.scan_object)
        except MultipleObjectsReturned:
            logging.error(
                'Multiple statistics objects found for scan job {}'.format(
                    self.scan_id))

        if self.scanner_spider.crawler.stats.get_value(
                'last_modified_check/pages_skipped'):
            statistics.files_skipped_count += self.scanner_spider.crawler.stats.get_value(
                'last_modified_check/pages_skipped')
        if self.scanner_spider.crawler.stats.get_value(
                'downloader/request_count'):
            statistics.files_scraped_count += self.scanner_spider.crawler.stats.get_value(
                'downloader/request_count')
        if self.scanner_spider.crawler.stats.get_value(
                'downloader/exception_type_count/builtins.IsADirectoryError'):
            statistics.files_is_dir_count += self.scanner_spider.crawler.stats.get_value(
                'downloader/exception_type_count/builtins.IsADirectoryError')

        statistics.save()
        logging.debug('Statistic saved.')

    def handle_error(self, failure, response, spider):
        """Handle spider errors, updating scan status."""
        logging.error("Scan failed: %s" % failure.getErrorMessage())
        self.store_stats()
        scan_object = Scan.objects.get(pk=self.scan_id)
        scan_object.reason = failure.getErrorMessage()
        scan_object.save()

    def handle_idle(self, spider):
        """Handle when the spider is idle.

        Keep it open if there are still queue items to be processed.
        """
        logging.debug("Spider Idle...")
        # Keep spider alive if there are still queue items to be processed
        remaining_queue_items = ConversionQueueItem.objects.filter(
            status__in=[
                ConversionQueueItem.NEW, ConversionQueueItem.PROCESSING
            ],
            url__scan=self.scan_object).count()

        if remaining_queue_items > 0:
            logging.info(
                "Keeping spider alive: %d remaining queue items to process" %
                remaining_queue_items)
            raise DontCloseSpider
        else:
            logging.info("No more active processors, closing spider...")
Example #47
import scrapy
from scrapy.crawler import CrawlerProcess
from scrapy.utils.project import get_project_settings
from ca_scraper.spiders.newspapers.contra_costa_times_spider import ContraCostaTimesSpider
from ca_scraper.spiders.newspapers.la_times_spider import LATimesSpider
from ca_scraper.spiders.newspapers.mercury_news_spider import MercuryNewsSpider
from ca_scraper.spiders.newspapers.oc_register_spider import OCRegisterSpider
from ca_scraper.spiders.newspapers.press_enterprise_spider import PressEnterpriseSpider
from ca_scraper.spiders.newspapers.sacramento_bee_spider import SacramentoBeeSpider
from ca_scraper.spiders.newspapers.san_diego_union_tribune_spider import SanDiegoUnionTribuneSpider
from ca_scraper.spiders.newspapers.san_francisco_chronicle_spider import SanFranciscoChronicleSpider

process = CrawlerProcess(get_project_settings())
process.crawl(ContraCostaTimesSpider)
process.crawl(LATimesSpider)
process.crawl(MercuryNewsSpider)
process.crawl(OCRegisterSpider)
process.crawl(PressEnterpriseSpider)
process.crawl(SacramentoBeeSpider)
process.crawl(SanDiegoUnionTribuneSpider)
process.crawl(SanFranciscoChronicleSpider)
process.start()  # the script will block here until all crawling jobs are finished
Example #48
def run_config(config):
    config = ConfigLoader(config)
    CustomMiddleware.driver = config.driver
    DocumentationSpider.NB_INDEXED = 0

    if config.use_anchors:
        from . import scrapy_patch

    strategy = DefaultStrategy(config)

    algolia_helper = AlgoliaHelper(
        config.app_id, config.api_key, config.index_name,
        AlgoliaSettings.get(config, strategy.levels))

    DOWNLOADER_MIDDLEWARES_PATH = 'scraper.src.custom_middleware.CustomMiddleware'
    DOWNLOADER_CLIENTCONTEXTFACTORY = 'scraper.src.scrapy_patch.CustomContextFactory'

    if __name__ == '__main__':
        DOWNLOADER_MIDDLEWARES_PATH = 'src.custom_middleware.CustomMiddleware'
        DOWNLOADER_CLIENTCONTEXTFACTORY = 'src.scrapy_patch.CustomContextFactory'

    process = CrawlerProcess({
        'LOG_ENABLED':
        '1',
        'LOG_LEVEL':
        'ERROR',
        # 'LOG_LEVEL': 'DEBUG',
        'USER_AGENT':
        config.user_agent,
        'DOWNLOADER_MIDDLEWARES': {
            DOWNLOADER_MIDDLEWARES_PATH: 900
        },
        # Need to be > 600 to be after the redirectMiddleware
        'DOWNLOADER_CLIENTCONTEXTFACTORY':
        DOWNLOADER_CLIENTCONTEXTFACTORY
    })

    process.crawl(DocumentationSpider,
                  config=config,
                  algolia_helper=algolia_helper,
                  strategy=strategy)

    process.start()
    process.stop()

    # Kill browser if needed
    BrowserHandler.destroy(config.driver)

    if len(config.extra_records) > 0:
        algolia_helper.add_records(config.extra_records, "Extra records")

    if len(Camelizer.synonyms) > 0:
        algolia_helper.add_synonyms(Camelizer.synonyms)

    print("")

    if DocumentationSpider.NB_INDEXED > 0:
        algolia_helper.commit_tmp_index()
        print('Nb hits: ' + str(DocumentationSpider.NB_INDEXED))
        config.update_nb_hits(DocumentationSpider.NB_INDEXED)
    else:
        print('Crawling issue: nbHits 0 for ' + config.index_name)
        algolia_helper.report_crawling_issue()

    print("")
Example #49
def scrap_data():
    process = CrawlerProcess(get_project_settings())
    process.crawl('books')
    process.crawl('hozmart')
    process.start()
Example #50
from scrapy.crawler import CrawlerProcess
from scrapy.settings import Settings
from edu_parse.spiders.autoyoula import AutoyoulaSpider

if __name__ == '__main__':
    crawler_settings = Settings()
    crawler_settings.setmodule("edu_parse.settings")
    crawler_proc= CrawlerProcess(settings=crawler_settings)
    crawler_proc.crawl(AutoyoulaSpider)
    crawler_proc.start()
    pass
Example #51
 def handle(self, *args, **options):
     process = CrawlerProcess(get_project_settings())
     process.crawl(KalerkanthoSpider)
     process.start()
Example #52
def foo():
    process = CrawlerProcess()
    process.crawl(RFA_spider)
    process.start()
Example #53
class Data_Spider:
    def __init__(self):
        self.process=CrawlerProcess(get_project_settings())
        self.db=DBSession()
        self.init_seed_data()
        # set default values
        # self.title_word=str(input('请输入学术讲座通知的匹配关键字:'))
        self.title = '报告题目:,学术报告:,题目,报告主题:,Title'        # (default values)
        self.speaker = '报告人:,主讲人:,汇报人:,Speaker,报告专家'
        self.venue = '地点:,Address,Venue,Place'
        self.time = '日期:,时间:,Time'
        self.title_word=''

    # initialize the seed table data
    def init_seed_data(self):
        init=self.db.query(Seed).all()
        if len(init)==0:
            init_data=Seed()
            init_data.set_init_data(self.db)

    def set_college_url(self,college_url):
        # self.college_url=input('请输入需要爬取的学校的通知网址:')   #start_url
        self.college_url =college_url
    def set_college(self,college):
        self.college=college

    def set_next_xpath(self,next_xpath):
        self.next_xpath=next_xpath

    def set_url_xpath(self,url_xpath):
        self.url_xpath=url_xpath

    def set_text_xpath(self,text_xpath):
        self.text_xpath=text_xpath

    # multiple keywords are separated by ","
    def set_title_word(self):
        self.title_word=''

    def set_notify_time_xpath(self,notify_time_xpath):
        if len(notify_time_xpath)>0:
            self.notify_time_xpath=notify_time_xpath
        else:
            self.notify_time_xpath=''

    # keyword setters, now deprecated
    # def set_title(self,title):
    #     if len(title)>0:
    #         self.title=self.title+','+title
    #     self.title=self.title.replace(',',',')
    # def set_speaker(self,speaker):
    #     if len(speaker)>0:
    #         self.speaker=self.speaker+','+speaker
    #     self.speaker=self.speaker.replace(',',',')
    # def set_venue(self,venue):
    #     if len(venue)>0:
    #         self.venue=self.venue+','+venue
    #     self.venue = self.venue.replace(',', ',')
    # def set_time(self,time):
    #     if len(time)>0:
    #         self.time=self.time+','+time
    #     self.time = self.time.replace(',', ',')

    # def insert_seed(self,college_url):
    # def insert_seed_test(self):
    #     self.insert_seed()

    def insert_seed(self,db):
        # college_url=str(input('请输入需要爬取的学校的通知网址:'))
        # this part is skipped now that a GUI has been added
        # self.set_college_url(college_url)
        # college = str(input('请输入需要爬取的学校(学院)的名称:'))
        # self.set_college(college)
        # next_xpath=str(input('请输入通知网站下一页的xpath选择器路径:'))
        # self.set_next_xpath(next_xpath)
        # url_xpath=str(input('请输入通知网站下每个具体网站超链接的xpath路径:'))
        # self.set_url_xpath(url_xpath)
        # text_xpath=str(input('请输入具体通知页面下,爬取通知正文每行文字的xpath路径:'))
        # self.set_text_xpath(text_xpath)
        # notify_time_xpath=str(input('请输入具体通知页面下,爬取通知时间的xpath路径,默认为空(不存在就不必输入):'))
        # self.set_notify_time_xpath(notify_time_xpath)
        # # the five items above are required; the ones below are optional
        # title_word=str(input('请输入总通知页面下通知标题的字符匹配规则:(可选择不输入)'))
        # self.title_word=title_word
        # title=str(input('请输入报告标题的字符匹配规则:(可选择不输入)'))
        # self.set_title(title)
        # speaker = str(input('请输入报告人的字符匹配规则:(可选择不输入)'))
        # self.set_speaker(speaker)
        # venue = str(input('请输入报告地点的字符匹配规则:(可选择不输入)'))
        # self.set_venue(venue)
        # time = str(input('请输入报告时间的字符匹配规则:(可选择不输入)'))
        # self.set_time(time)
        try:
            seed=Seed(start_url= self.college_url,college= self.college,url_xpath= self.url_xpath,
                         nextpage_xpath= self.next_xpath,title_word= self.title_word,notice_time_xpath= self.notify_time_xpath,
                        # title=self.title, speaker=self.speaker, venue=self.venue, time=self.time,
                         text_xpath= self.text_xpath)
            db.add(seed)
            db.commit()
        except Exception as e:
            print(e)
            db.rollback()
            print('Failed to insert data')

    # crawl a single specified school
    def get_existed_urls(self,seed):
        existed_urls = []
        urls = self.db.query(Notification.url).filter(seed.college == Notification.college).all()
        # existed_urls=[]
        if len(urls)>0:
            for url in urls:
                existed_urls.append(url[0])
        return existed_urls

    # common workflow for crawling a school's academic-talk notices
    def common_spider(self,seed):
        urlHandle=UrlHandle()
        existed_urls=self.get_existed_urls(seed)
        urlHandle.set_start_url(seed.start_url)
        urlHandle.set_title_word(seed.title_word)
        urlHandle.set_existed_urls(existed_urls)
        urlHandle.set_nextpage_xpath(seed.nextpage_xpath)
        urlHandle.set_url_xpath(seed.url_xpath)
        title_urls=urlHandle.get_filte_urls()
        selenium_spider = SeleniumSpider(seed, title_urls)
        selenium_spider.start_selenium()
        # self.process.crawl(NoticeSpider,seed,title_urls)
        # self.process.start()

    # crawl academic-talk notices for a single school
    def university_spider(self,seed):
        # college_url=self.set_college_url()
        # seed = self.db.query(Seed).filter(Seed.start_url == college_url).one()
        if seed.start_url=='https://iiis.tsinghua.edu.cn/zh/seminars/':    # Tsinghua University
            self.process.crawl(ThuIiisSpider)
            self.process.start()
        else:
            self.common_spider(seed)

    # crawl academic-talk notices for all schools; crawling them all in one go will fail
    def universities_spider(self):
        seeds=self.db.query(Seed).all()
        for seed in seeds:
            # for each school, just call the single-school crawl function
            self.university_spider(seed)

    # def start_spider(self):
    #     is_one_spider=str(input('爬取一个学校学术信息(y),多个学校学术信息(n)?y/n'))
    #     while True:
    #         print(is_one_spider)
    #         if is_one_spider in ['y','Y','yes','Yes']:
    #             college_url = str(input('请输入需要爬取的学校的通知网址:'))
    #             seed = self.db.query(Seed).filter(Seed.start_url == college_url).all()
    #             if len(seed)==0:
    #                 seed=self.insert_seed(college_url)
    #                 self.university_spider(seed)
    #             else:
    #                 self.university_spider(seed[0])
    #             is_continue=str(input(('爬取完成,是否继续?y/n')))
    #             if is_continue in ['y','Y','yes','Yes']:
    #                 is_one_spider = str(input('爬取一个学校学术信息(y),多个学校学术信息(n)?y/n'))
    #             else:
    #                 break
    #         elif is_one_spider in ['n','no','No','N']:
    #             self.universities_spider()
    #             print('所有信息爬取完成!')
    #             break
    #         else:
    #             print('你的输入错误,请重新输入:')
    #             is_one_spider=str(input('爬取一个学校学术信息(y),多个学校学术信息(n)?y/n'))

# run from the main program
# spider=Data_Spider()
# spider.start_spider()

# Notification URL of the school to crawl: http://sist.swjtu.edu.cn/list.do?action=news&navId=40
# Name of the school (college) to crawl: 西南交通大学信息科学与技术学院 (School of Information Science and Technology, Southwest Jiaotong University)
# XPath of the next-page link on the notification site: //div[@class="tableFootLeft"]//a[text()="下一页"]
# XPath of each notice hyperlink on the notification site: //*[@id="rightPageContent"]/dl//dd
# XPath of each line of body text on a notice page: //*[@id="newsBody"]
# XPath of the notice time on a notice page, empty by default (omit if absent): //*[@id="newsInfo"]

# http://cs.gzu.edu.cn/forum.php?mod=forumdisplay&fid=57&page=1
# 贵州大学计算机科学与技术学院 (College of Computer Science and Technology, Guizhou University)
# url_xpath=//*[@id="newsList"]//p
# nextpage=//*[@id="bmbw0pgscl"]/div//a[text()='下一页']
# notify_time=//*[@id="ct"]/div[1]/div/div[1]/p
# full_notice_text=//td[@class="t_f"]
Example #54
import json

class MainSpider(scrapy.Spider):
    
    name = 'main'
    # allowed_domains = ['longandfoster.com']
    
    start_urls = ['https://www.longandfoster.com/include/ajax/api.aspx?op=SearchAgents&firstname=&lastname=&page=1&pagesize=200']

    def parse(self, response):
        resp = json.loads(json.loads(response.body)['Entity'])
        for each in resp:
            name = each.get('DisplayName')

            yield {
                "Name": name,
            }

# --- run without project and save in `output.csv` ---

from scrapy.crawler import CrawlerProcess

c = CrawlerProcess({
    'USER_AGENT': 'Mozilla/5.0',
    # save in file CSV, JSON or XML
    'FEED_FORMAT': 'csv',     # csv, json, xml
    'FEED_URI': 'output.csv', #
})
c.crawl(MainSpider)
c.start() 
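Note that FEED_FORMAT and FEED_URI are deprecated on recent Scrapy releases in favour of the FEEDS setting; a hedged equivalent of the settings block above:

c = CrawlerProcess({
    'USER_AGENT': 'Mozilla/5.0',
    # one entry per output target: file URI -> exporter options
    'FEEDS': {'output.csv': {'format': 'csv'}},
})
c.crawl(MainSpider)
c.start()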
    def handle(self, *args, **options):
        process = CrawlerProcess(get_project_settings())

        process.crawl(HemnetSpider.HemnetSpider)
        process.start()
Example #56
def run_config(config):
    config = ConfigLoader(config)
    CustomDownloaderMiddleware.driver = config.driver
    DocumentationSpider.NB_INDEXED = 0

    strategy = DefaultStrategy(config)

    meilisearch_helper = MeiliSearchHelper(config.app_id, config.api_key,
                                           config.index_uid,
                                           config.custom_settings)

    root_module = 'src.' if __name__ == '__main__' else 'scraper.src.'
    DOWNLOADER_MIDDLEWARES_PATH = root_module + 'custom_downloader_middleware.' + CustomDownloaderMiddleware.__name__
    DUPEFILTER_CLASS_PATH = root_module + 'custom_dupefilter.' + CustomDupeFilter.__name__

    headers = {
        "Accept":
        "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8",
        "Accept-Language": "en",
    }  # Defaults for scrapy https://docs.scrapy.org/en/latest/topics/settings.html#default-request-headers

    if os.getenv("CF_ACCESS_CLIENT_ID") and os.getenv(
            "CF_ACCESS_CLIENT_SECRET"):
        headers.update({
            "CF-Access-Client-Id":
            os.getenv("CF_ACCESS_CLIENT_ID"),
            "CF-Access-Client-Secret":
            os.getenv("CF_ACCESS_CLIENT_SECRET"),
        })
    elif os.getenv("IAP_AUTH_CLIENT_ID") and os.getenv(
            "IAP_AUTH_SERVICE_ACCOUNT_JSON"):
        iap_token = IAPAuth(
            client_id=os.getenv("IAP_AUTH_CLIENT_ID"),
            service_account_secret_dict=json.loads(
                os.getenv("IAP_AUTH_SERVICE_ACCOUNT_JSON")),
        )(requests.Request()).headers["Authorization"]
        headers.update({"Authorization": iap_token})
    elif os.getenv("KC_URL") and os.getenv("KC_REALM") and os.getenv(
            "KC_CLIENT_ID") and os.getenv("KC_CLIENT_SECRET"):
        realm = KeycloakRealm(server_url=os.getenv("KC_URL"),
                              realm_name=os.getenv("KC_REALM"))
        oidc_client = realm.open_id_connect(
            client_id=os.getenv("KC_CLIENT_ID"),
            client_secret=os.getenv("KC_CLIENT_SECRET"))
        token_response = oidc_client.client_credentials()
        token = token_response["access_token"]
        headers.update({"Authorization": 'bearer ' + token})

    DEFAULT_REQUEST_HEADERS = headers

    process = CrawlerProcess({
        'LOG_ENABLED':
        '1',
        'LOG_LEVEL':
        'ERROR',
        'USER_AGENT':
        config.user_agent,
        'DOWNLOADER_MIDDLEWARES': {
            DOWNLOADER_MIDDLEWARES_PATH: 900
        },
        # Need to be > 600 to be after the redirectMiddleware
        'DUPEFILTER_USE_ANCHORS':
        config.use_anchors,
        # Use our custom dupefilter in order to be scheme agnostic regarding link provided
        'DUPEFILTER_CLASS':
        DUPEFILTER_CLASS_PATH,
        'DEFAULT_REQUEST_HEADERS':
        DEFAULT_REQUEST_HEADERS,
    })

    process.crawl(DocumentationSpider,
                  config=config,
                  meilisearch_helper=meilisearch_helper,
                  strategy=strategy)

    process.start()
    process.stop()

    # Kill browser if needed
    BrowserHandler.destroy(config.driver)

    if len(config.extra_records) > 0:
        meilisearch_helper.add_records(config.extra_records, "Extra records",
                                       False)

    print("")

    if DocumentationSpider.NB_INDEXED > 0:
        # meilisearch_helper.commit_tmp_index()
        print('Nb hits: {}'.format(DocumentationSpider.NB_INDEXED))
    else:
        print('Crawling issue: nbHits 0 for ' + config.index_uid)
        # meilisearch_helper.report_crawling_issue()
        sys.exit(EXIT_CODE_NO_RECORD)
    print("")
Example #57
def validate():
    process = CrawlerProcess(settings=settings)
    process.crawl(ValidatorSpider)
    process.start()
Example #58
def main():
    process = CrawlerProcess(settings=get_project_settings())
    process.crawl(OpenlibraryLoginSpider)
    process.start()
Example #59
from scrapy.crawler import Crawler, CrawlerProcess
from scraper import IMDbTop1000Spider
from index import IMDbIndex
from flask import Flask, request

app = Flask(__name__)

# Crawl the service using our spider and store it in a list
movies = []


def collect_items(item, response, spider):
    movies.append(item)


crawler = Crawler(IMDbTop1000Spider)
crawler.signals.connect(collect_items, signals.item_scraped)
process = CrawlerProcess()
process.crawl(crawler)
process.start()  # block until finished

# Index this data to Whoosh
imdb_index = IMDbIndex()
imdb_index.bulk_index(movies)


@app.route('/search', methods=['GET'])
def index():
    search_term = request.args.get('q', type=str)
    return imdb_index.search(search_term)
Example #60
        }

        stock = VietstockItem()
        stock['date'] = items['date']
        stock['time'] = items['time']
        stock['stock_name'] = items['stock_name']
        stock['price'] = items['price']
        yield stock


def run_crawl():
    runner = CrawlerRunner({
        'USER_AGENT':
        'Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1)',
    })
    deferred = runner.crawl(StockSpider)
    # you can use reactor.callLater or task.deferLater to schedule a function
    deferred.addCallback(reactor.callLater, 5, run_crawl)
    return deferred


if __name__ == "__main__":
    process = CrawlerProcess(get_project_settings())
    scheduler = TwistedScheduler()
    scheduler.add_job(process.crawl,
                      'interval',
                      args=[StockSpider],
                      seconds=10)
    scheduler.start()
    process.start(False)
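The comment in run_crawl() above hints at reactor.callLater / task.deferLater as an alternative to APScheduler. A hedged sketch of that variant, re-queuing the same crawl five seconds after each run finishes; StockSpider and the user agent are taken from the example above.

from twisted.internet import reactor, task
from scrapy.crawler import CrawlerRunner

runner = CrawlerRunner({
    'USER_AGENT': 'Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1)',
})

def crawl_forever():
    d = runner.crawl(StockSpider)
    # when this crawl's deferred fires, wait 5 seconds, then schedule the next run
    d.addCallback(lambda _: task.deferLater(reactor, 5, crawl_forever))
    return d

# crawl_forever()
# reactor.run()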