def execute_task(self, website: Website, busy: Value, post_id: str, comment_id: str):
    busy.value = 1
    if os.path.exists("data.json"):
        os.remove("data.json")

    print("Started crawling task")
    process = CrawlerProcess(get_project_settings())
    process.crawl("od_links", base_url=website.url)
    process.start()
    print("Done crawling")

    self.db.import_json("data.json", website)
    os.remove("data.json")
    print("Imported in SQLite3")

    if post_id:
        # Reply to post
        stats = self.db.get_website_stats(website.id)
        comment = self.reddit_bot.get_comment({"": stats}, website.id)
        print(comment)
        if "total_size" in stats and stats["total_size"] > 10000000:
            post = self.reddit_bot.reddit.submission(post_id)
            self.reddit_bot.reply(post, comment)
        else:
            self.reddit_bot.log_crawl(post_id)
    elif comment_id:
        # Reply to comment
        stats = self.db.get_website_stats(website.id)
        comment = self.reddit_bot.get_comment({"There you go!": stats}, website.id)
        print(comment)
        reddit_comment = self.reddit_bot.reddit.comment(comment_id)
        self.reddit_bot.reply(reddit_comment, comment)

    busy.value = 0
    print("Done crawling task")
def service_sis(self):
    process = CrawlerProcess({
        'USER_AGENT': 'Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1)'
    })
    process.crawl(worker.Worker)
    process.start()  # the script will block here until the crawling is finished
def main():
    """Sets up the item signal and runs the spider."""
    # set up signal to catch items scraped
    from scrapy import signals
    from scrapy.xlib.pydispatch import dispatcher

    def catch_item(sender, item, **kwargs):
        print "Got:", item

    dispatcher.connect(catch_item, signal=signals.item_passed)

    # shut off log
    from scrapy.conf import settings
    settings.overrides['LOG_ENABLED'] = False

    # set up crawler
    from scrapy.crawler import CrawlerProcess
    crawler = CrawlerProcess(settings)
    crawler.start()

    # schedule spider

    # start engine scrapy/twisted
    print "STARTING ENGINE"
    crawler.start()
    print "ENGINE STOPPED"
def runSpiderProcess(spider_cls, *args, **kwargs):
    """
    Helper method that starts a spider with the given init arguments, waits for it
    to complete, and returns the items it yielded in a list.

    :param spider_cls: the spider class to run
    :param args: the indexed arguments to the spider
    :param kwargs: the keyword arguments to the spider
    :return: a list of items yielded by the spider
    """
    process = CrawlerProcess()
    process.crawl(spider_cls, *args, **kwargs)

    final_result = []

    def _nab_item(item):
        # FIXME: this silly dance of encoding and decoding is to prevent scrapy items from being returned to celery
        # FIXME: celery can't serialize them, so it throws a rather opaque error, but it's fine with lists and dicts
        final_result.append(json.loads(scrapy_encoder.encode(item)))

    for crawler in process.crawlers:
        crawler.signals.connect(_nab_item, item_scraped)

    process.start()
    process.stop()

    return final_result
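# Note (added): a minimal usage sketch for the helper above, assuming a hypothetical
# QuotesSpider class; the call blocks until the crawl finishes and returns plain
# dicts/lists that are safe to hand back to celery.
# items = runSpiderProcess(QuotesSpider, category="books")
# for item in items:
#     print(item["title"])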
def main(argv):
    try:
        opts, args = getopt.getopt(argv, "ch:t:s:", ['title=', 'section='])
    except getopt.GetoptError:
        print 'Usage:\npython2.7 decc.py -h(help)\npython2.7 decc.py -c(crawl articles)\npython2.7 decc.py -s(search article by section) <section>\npython2.7 decc.py -t(search article by title) <title>'
        sys.exit(2)
    for opt, arg in opts:
        if opt == '-h':
            print 'Usage:\npython2.7 decc.py -h(help)\npython2.7 decc.py -c(crawl articles)\npython2.7 decc.py -s(search article by section) <section>\npython2.7 decc.py -t(search article by title) <title>'
            sys.exit()
        elif opt == '-c':
            # start crawling article here
            print "crawling"
            process = CrawlerProcess(get_project_settings())
            process.crawl(BBCArticleSpider)
            process.start()
        elif opt in ('-t', '--title'):
            print "search by title"
            # start searching article by title
            results = BBCArticleItem.fetch_by_title(arg)
            for result in results:
                print result
        elif opt in ('-s', '--section'):
            print "search by section"
            # start searching article by section
            results = BBCArticleItem.fetch_by_section(arg)
            for result in results:
                print result
def crawl(spiders_classes, connector, debug=False,
          spider_error_callback=stdout_error_callback, scrapy_settings=None):
    """
    Launch a crawl job for JobSpider classes.

    :param scrapy_settings: dict of settings merged with CrawlerProcess default settings
    :param debug: (bool) activate or disable debug
    :param spider_error_callback: callback for spider errors
        (see http://doc.scrapy.org/en/latest/topics/signals.html#spider-error)
    :param connector: Connector instance
    :param spiders_classes: JobSpider class list
    :return: list of spider instances
    """
    if debug:
        dispatcher.connect(spider_error_callback, signals.spider_error)

    settings = {
        'ITEM_PIPELINES': {
            'pyjobs_crawlers.pipelines.RecordJobPipeline': 1,
        },
        'connector': connector,
        'LOG_ENABLED': False,
        'DOWNLOAD_DELAY': 1 if not debug else 0,
    }
    if scrapy_settings:
        settings.update(scrapy_settings)

    process = CrawlerProcess(settings)
    for spider_class in spiders_classes:
        process.crawl(spider_class, debug=debug)

    spiders = []
    for crawler in list(process.crawlers):
        spiders.append(crawler.spider)
    process.start()

    return spiders
def main():
    """Main routine for running the spider."""
    # set up signal to catch scraped items
    from scrapy import signals
    from scrapy.xlib.pydispatch import dispatcher

    def catch_item(sender, item, **kwargs):
        print "Item Extraido:", item

    dispatcher.connect(catch_item, signal=signals.item_passed)

    from scrapy.conf import settings
    settings.overrides['LOG_ENABLED'] = False

    # setup crawler
    from scrapy.crawler import CrawlerProcess
    crawler = CrawlerProcess(settings)
    crawler.install()
    crawler.configure()

    # define the spider for the crawler
    crawler.crawl(BloggerSpider())

    # start scrapy
    print "STARTING ENGINE"
    crawler.start()
    print "ENGINE STOPPED"
class CrawlerScript():
    def __init__(self):
        settings = get_project_settings()
        settings.set('LOG_ENABLED', False, priority='cmdline')
        #settings.overrides['LOG_ENABLED'] = False
        self.crawler = CrawlerProcess(settings)
        self.items = []
        SignalManager(dispatcher.Any).connect(self._item_passed, signal=signals.item_scraped)

    def _item_passed(self, item, response, spider):
        self.items.append(item)

    def _crawl(self, q, queue):
        self.crawler.crawl(BingSpider, q=q)
        self.crawler.start()
        self.crawler.stop()
        queue.put(self.items)

    def crawl(self, q):
        queue = Queue()
        p = Process(target=self._crawl, args=[q, queue])
        p.start()
        p.join()
        return queue.get(True)
def get_scraped_sites_data():
    """Returns output for venues which need to be scraped."""
    class RefDict(dict):
        """A dictionary which returns a reference to itself when deepcopied."""
        def __deepcopy__(self, memo):
            return self

    # Hack: we pass a dictionary which can't be deep-copied into the settings
    # so as to _return_ the scraper output. As far as I can tell, this is the
    # only way to return the scraper output to the script itself.
    output = RefDict()

    settings = Settings({
        'LOG_ENABLED': False,
        'ITEM_PIPELINES': {
            'mgrok.pipelines.JsonWriterPipeline': 1
        },
        'PIPELINE_OUTPUT': output,
        'USER_AGENT': 'Chrome/41.0.2228.0'
    })

    crawler_process = CrawlerProcess(settings)
    for spider in SCRAPY_SPIDERS:
        crawler_process.crawl(spider)
    crawler_process.start()

    return output
def run(urls, city):
    process = CrawlerProcess()
    spiders = [make_spider(artist, url, city) for artist, url in urls]
    for spider_cls in spiders:
        process.crawl(spider_cls)
    # the script will block here until the crawling is finished
    process.start()
class CrawlerScript():
    def __init__(self):
        self.crawler = CrawlerProcess(settings)
        if not hasattr(project, 'crawler'):
            self.crawler.install()
        self.crawler.configure()
        self.items = []
        dispatcher.connect(self._item_passed, signals.item_passed)

    def _item_passed(self, item):
        self.items.append(item)

    def _crawl(self, queue, spider_name):
        spider = self.crawler.spiders.create(spider_name)
        if spider:
            self.crawler.queue.append_spider(spider)
        self.crawler.start()
        self.crawler.stop()
        queue.put(self.items)

    def crawl(self, spider):
        queue = Queue()
        p = Process(target=self._crawl, args=(queue, spider,))
        p.start()
        p.join()
        return queue.get(True)
def handle(self, *args, **options):
    # It would be better to pass this in as a parameter to PayoutSpider
    global start_date
    start_date = datetime.datetime(2015, 1, 1, tzinfo=pytz.UTC)

    delete = options.get('delete')
    delete_all = options.get('delete_all')
    retrieve_all = options.get('retrieve_all')

    previous_payout = None
    previous_payouts = codementor_models.Payout.objects.all().order_by('-date')
    if delete_all or (delete and previous_payouts.count() == 0):
        codementor_models.Review.objects.all().delete()
        codementor_models.Session.objects.all().delete()
        codementor_models.Payout.objects.all().delete()
        codementor_models.Payment.objects.all().delete()
    elif delete:
        previous_payout = previous_payouts[0]
        codementor_models.Review.objects.filter(date__gt=start_date).delete()
        codementor_models.Session.objects.filter(started_at__gt=start_date).delete()
        previous_payout.delete()
        codementor_models.Payment.objects.filter(payout__isnull=True).delete()

    if not retrieve_all and previous_payout:
        start_date = previous_payout.date

    process = CrawlerProcess({
        'USER_AGENT': 'Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1)'
    })
    process.crawl(PayoutSpider)
    process.start()
def run_spider(spider, settings):
    """Run a spider with given settings"""
    from scrapy import signals
    from scrapy.xlib.pydispatch import dispatcher
    from scrapy.settings import CrawlerSettings

    def catch_item(sender, item, **kwargs):
        #log.msg("Got:" + str(item))
        pass

    dispatcher.connect(catch_item, signal=signals.item_passed)

    # clean storage
    scraperwiki.sqlite.execute("drop table if exists " + spider.name)
    scraperwiki.sqlite.commit()

    from scrapy.crawler import CrawlerProcess
    settings = CrawlerSettings(values=settings)
    crawler = CrawlerProcess(settings)
    crawler.install()
    crawler.configure()
    crawler.crawl(spider)
    #log.start(loglevel='DEBUG')
    crawler.start()
def run(self):
    settings = get_project_settings()
    process = CrawlerProcess(settings)
    process.crawl('stackoverflow')
    process.start()
def get(self):
    while True:
        process = CrawlerProcess(get_project_settings())
        process.crawl('iqiyi')
        process.start()
        time.sleep(3000)
    self.finish()
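# Note (added): CrawlerProcess.start() runs the Twisted reactor, which cannot be
# restarted in the same interpreter, so a loop like the one above fails on its second
# iteration with ReactorNotRestartable. A minimal sketch of one workaround, assuming
# the same 'iqiyi' spider and project settings: run each crawl in a fresh child process.
import time
from multiprocessing import Process

from scrapy.crawler import CrawlerProcess
from scrapy.utils.project import get_project_settings


def _run_once():
    process = CrawlerProcess(get_project_settings())
    process.crawl('iqiyi')
    process.start()  # blocks until this crawl finishes


def crawl_forever(interval=3000):
    while True:
        p = Process(target=_run_once)  # fresh interpreter, fresh reactor
        p.start()
        p.join()
        time.sleep(interval)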
def spiderCrawl(bandname):
    createLink(bandname)
    settings = get_project_settings()
    settings.set('USER_AGENT', 'Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1)')
    process = CrawlerProcess(settings)
    process.crawl(MySpider)
    process.start()
def startSpiderTest(group_type, spider_type, spider_group_name, spider_name):
    # Use Scrapy's internal API
    settings = get_project_settings()
    # Instantiate a crawler process
    crawlerProcess = CrawlerProcess(settings)
    # Create a crawler; one crawler process can run several crawls.
    crawlerProcess.create_crawler(spider_name)
    crawler = crawlerProcess.create_crawler(spider_name)
    # Hook up crawler signals: when the spider emits a signal, the matching handler is called.
    crawler.signals.connect(spiderSignal.startSingnal, signals.spider_opened)
    crawler.signals.connect(spiderSignal.errorSingnal, signals.spider_error)
    crawler.signals.connect(spiderSignal.stopSingnal, signals.spider_closed)
    # Look up the spider class
    spiderConf = Spider_Dict[group_type][spider_type]
    spiderArgs = spiderConf[1].copy()
    spiderArgs["name"] = spider_name
    spiderArgs["redis_key"] = spider_name
    spiderArgs["spider_type"] = spider_type
    spiderArgs["spider_group_name"] = spider_group_name
    spiderArgs["task_id"] = "-1"
    spider = spiderConf[0](**spiderArgs)
    # Assign the spider to the crawler
    crawler.configure()
    crawler.crawl(spider)
    # Start the crawler process.
    crawlerProcess.start()
    crawlerProcess.stop()
class MySpiderProcess1(scrapy.Spider):
    def __init__(self, name, urls):
        self.name = name
        self.start_urls = urls
        scrapy.Spider.__init__(self)

    def parse(self, response):
        print('parse response')

    def _crawl(self):
        settings = Settings()
        settings.set('ITEM_PIPELINES', {
            'app.pipelines.JsonWriterPipeline': 300
        })
        self.process = CrawlerProcess(settings)
        self.process.crawl(self, self.name, self.start_urls)
        self.process.start()
        # self.process.stop()
        # self.process.join()

    def start(self):
        p = Process(target=self._crawl)
        p.start()
        p.join()

    # def start(self):
    #     self._crawl()

    def stop(self):
        self.process.stop()
def handle(self, *args, **options):
    from scrapy import signals
    from scrapy.xlib.pydispatch import dispatcher

    def catch_item(sender, item, **kwargs):
        print "Got:", item

    dispatcher.connect(catch_item, signal=signals.item_passed)

    from scrapy.conf import settings
    settings.overrides['LOG_ENABLED'] = True

    from scrapy.crawler import CrawlerProcess
    crawler = CrawlerProcess(settings)
    crawler.install()
    crawler.configure()

    from alescspider.spiders import *
    spiders = [deputado_spider.DeputadoSpider()]
    #spiders = [presenca_spider.PresencaSpider(), votos_spider.VotosSpider(), deputado_spider.DeputadoSpider()]
    for spider in spiders:
        crawler.queue.append_spider(spider)

    print "STARTING ENGINE"
    crawler.start()
    print "ENGINE STOPPED"
def main():
    """Index alexa demographics"""
    engine = db_connect()
    Session = sessionmaker(bind=engine)
    session = Session()

    settings = get_project_settings()
    settings.set('ITEM_PIPELINES',
                 {'demographic_scraper.demographic_scraper.pipelines.WebsiteDemographicPipeline': 300})
    settings.set('EXTENSIONS', {'scrapy.telnet.TelnetConsole': None})

    process = CrawlerProcess(settings)
    for website in session.query(WebsitesContent).all():
        demographic = list(session.query(Websites).filter_by(link=website.link))
        if len(demographic) == 0:
            url = website.link
            print website.link
            AlexaSpider.name = url
            process.crawl(AlexaSpider, url=url, db_session=session)
    process.start()
    process.stop()

    session.close()
def scrape(spider):
    with transaction.atomic(), reversion.create_revision():
        process = CrawlerProcess(DEFAULT_CRAWLER_OPTIONS)
        process.crawl(spider)
        # the script will block here until the crawling is finished
        process.start()
    return
def handle(self, *args, **options):
    setting = {
        'USER_AGENT': options['user_agent'],
        'DOWNLOAD_DELAY': options['download_delay'],
        'LOG_FILE': settings.SCRAPY_LOG_FILE,
        'LOG_LEVEL': settings.SCRAPY_LOG_LEVEL,
    }

    if options['proxy_list']:
        try:
            f = open(options['proxy_list'])
        except IOError as e:
            raise CommandError('cannot open proxy list file for read')

        # Retry many times since proxies often fail
        setting['RETRY_TIMES'] = 10
        # Retry on most error codes since proxies fail for different reasons
        setting['RETRY_HTTP_CODES'] = [500, 503, 504, 400, 403, 404, 408]
        setting['DOWNLOADER_MIDDLEWARES'] = {
            'scrapy.downloadermiddlewares.retry.RetryMiddleware': 90,
            'spider.randomproxy.RandomProxy': 100,
            'scrapy.downloadermiddlewares.httpproxy.HttpProxyMiddleware': 110,
        }
        setting['PROXY_LIST'] = options['proxy_list']

    process = CrawlerProcess(setting)
    process.crawl(BaiduSpider)
    process.start()
def __init__(self, titlesfile=None, platform=None, region=None):

    # set default encoding to utf8 for parsing and logging
    # utf-8 characters in console and files
    #
    reload(sys)
    sys.setdefaultencoding('utf8')

    configure_logging(install_root_handler=False)
    logging.basicConfig(
        filename='export.log',
        filemode='a',
        format='%(levelname)s: %(message)s',
        level=logging.INFO
    )

    # identify platform
    #
    self.platform = platform
    if self.platform is None:
        logging.error('No platform found! Pass it as an argument.')
        return
    else:
        platformId = platforms.getId(self.platform)
        if platformId is None:
            logging.error('Platform ' + self.platform + ' not supported.')
            return

    self.titlesfile = titlesfile
    self.region = region
    if self.region is None:
        self.region = "Worldwide"

    if titlesfile:
        titles = []
        urls = []
        with open(self.titlesfile) as f:
            titles = f.read().splitlines()
        for title in titles:
            logging.debug('Submitting title:' + title)
            urls.append(
                'http://mobygames.com/search/quick' +
                '?q=' + title +
                '&p=' + platformId +
                '&search=Go'
                '&sFilter=1'
                '&sG=on'
                '&search_title=' + urllib.quote(title) +
                '&search_platform=' + urllib.quote(self.platform) +
                '&search_region=' + urllib.quote(self.region)
            )

        process = CrawlerProcess(get_project_settings())
        process.crawl(MobygamesSpider, start_urls=urls)
        process.start()
    else:
        logging.warning('No file.')
def crawl(spiders_classes, connector, debug=False, spider_error_callback=stdout_error_callback):
    """
    Launch a crawl job for JobSpider classes.

    :param debug: (bool) activate or disable debug
    :param spider_error_callback: callback for spider errors
        (see http://doc.scrapy.org/en/latest/topics/signals.html#spider-error)
    :param connector: Connector instance
    :param spiders_classes: JobSpider class list
    :return: list of spider instances
    """
    if debug:
        dispatcher.connect(spider_error_callback, signals.spider_error)

    process = CrawlerProcess({
        'ITEM_PIPELINES': {
            'pyjobs_crawlers.pipelines.RecordJobPipeline': 1,
        },
        'connector': connector,
        'LOG_ENABLED': False
    })

    for spider_class in spiders_classes:
        process.crawl(spider_class)

    spiders = []
    for crawler in list(process.crawlers):
        spiders.append(crawler.spider)
    process.start()

    return spiders
def _crawl(path=None):
    crawl = CrawlerProcess({
        'USER_AGENT': 'Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1)'
    })
    crawl.crawl(ProvinceSpider)
    crawl.start()
    crawl.stop()
def Test_Scapy(self):
    spider = FtpSpider()
    process = CrawlerProcess({"USER_AGENT": "Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1)"})
    process.crawl(spider)
    process.start()
def scrapeando():
    from scrapy import signals
    from scrapy.xlib.pydispatch import dispatcher

    def catch_item(sender, item, **kwargs):
        """Fill the database."""
        for i in enumerate(item.items()):
            x = i[0]
            query = ("INSERT INTO book (Nombre ,Autor, Editorial ,Fecha, Precio, Link) VALUES ("
                     + decodifica(item['Nombre'][x]) + ","
                     + decodifica(item['Autor'][x]) + ","
                     + decodifica(item['Editorial'][x]) + ","
                     + decodifica(item['Fecha'][x]) + ","
                     + decodifica(item['Precio'][x]) + ","
                     + decodifica("http://www.casadellibro.com" + item['Link'][x]) + ");")
            db.micursor.execute(query)
            db.conexion.commit()
        print item

    dispatcher.connect(catch_item, signal=signals.item_passed)

    from scrapy.conf import settings
    settings.overrides['LOG_ENABLED'] = False

    from scrapy.crawler import CrawlerProcess
    crawler = CrawlerProcess(settings)
    crawler.install()
    crawler.configure()

    book = BookSpider()
    book.busqueda = unicode(search.getbusqueda())
    crawler.crawl(book)

    print "Start scraping to la Casa del Libro"
    crawler.start()
    print "End scraping to la Casa del Libro"
    crawler.stop()
def crawl(ctx, spiders, stats):
    """
    Crawl one or many or all pages.

    What spider(s) to run is determined in the following order:

    1. Spider(s) given as argument(s)
    2. Spider(s) specified in the configuration file

    Note that if a spider is given as an argument, the spiders in the
    configuration file are ignored.

    All available spiders will be used to crawl if no arguments are given and
    no spiders are configured.
    """
    settings = ctx.obj["settings"]
    if stats:
        settings.set("STATS_CLASS", "scrapy.statscollectors.MemoryStatsCollector")

    # Start a new crawler process.
    process = CrawlerProcess(settings)
    spiders = spiders_to_crawl(process, spiders)
    if not spiders:
        logger.error("Please specify what spiders you want to run!")
    else:
        for spider in spiders:
            logger.info("Starting crawl of {} ...".format(spider))
            process.crawl(spider)

    process.start()

    if settings.getbool("HTTPCACHE_ENABLED"):
        run_cleanup_cache(settings)
def ScrapeSite():
    db = 'crunchbase_startups'

    sitedomain = raw_input("Enter site domain: ")  # get user input
    sitedomain = parse_base_url(sitedomain)  # clean url

    sql = 'SELECT text FROM {} WHERE siteurl = %s'.format(db)
    cur.execute(sql, sitedomain)
    sitetext = cur.fetch()

    if sitetext != '':  # what does an empty ping return?
        print 'Site already scraped.'
        return sitetext

    process = CrawlerProcess({
        'USER_AGENT': 'Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1)',
        'ITEM_PIPELINES': {'pipelines.UserInputPipeline': 100},
        'DEPTH_LIMIT': 2,
        'DOWNLOAD_HANDLERS': {'s3': None},
        'LOG_LEVEL': 'INFO'
    })
    process.crawl(SoloSpider, domain=sitedomain)
    process.start()

    # presumably finished here - pull newly loaded sitetext for domain
    cur.execute(sql, sitedomain)
    return cur.fetch()
def main():
    """Sets up the item signal and runs the spider."""
    # set up signal to catch items scraped
    from scrapy import signals
    from scrapy.xlib.pydispatch import dispatcher

    def catch_item(sender, item, **kwargs):
        print "Got:", item

    options = parse_args()
    dispatcher.connect(catch_item, signal=signals.item_passed)

    # configure log and crawl depth
    from scrapy.conf import settings
    settings.overrides['LOG_ENABLED'] = True
    settings.overrides['DEPTH_LIMIT'] = 2

    # set up crawler
    from scrapy.crawler import CrawlerProcess
    crawler = CrawlerProcess(settings)
    crawler.install()
    crawler.configure()

    # schedule spider
    spider = MySpider(input=options.input, output=options.output)
    crawler.queue.append_spider(spider)

    # start engine scrapy/twisted
    print "STARTING ENGINE"
    crawler.start()
    print "ENGINE STOPPED"
def execute(tags_urls):
    process = CrawlerProcess({'LOG_LEVEL': LOG_LEVEL})
    process.crawl(TagSpider, tags=';'.join(tags_urls))
    process.start()
    process.stop()
import scrapy
import sys, getopt
from scrapy.crawler import CrawlerProcess
from ypscraper.spiders.yellowpages import YellowpagesSpider
from scrapy.utils.project import get_project_settings

process = CrawlerProcess(get_project_settings())
process.crawl('yellowpages', max_listings='100', infile='searches.json')
process.start()
#process.crawl(fun.FunSpider())
#process.start(stop_after_crawl=False)

process1 = CrawlerProcess({
    'USER_AGENT': 'Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1)',
    #'DOWNLOAD_DELAY': 1,
    #'RETRY_HTTP_CODES': {500, 502, 503, 504, 522, 524, 408, 456},
    #'CONCURRENT_REQUESTS_PER_IP': 32,
    #'CONCURRENT_REQUESTS_PER_DOMAIN': 32,
    #'COOKIES_ENABLED': False
    #'COOKIES_ENABLED': False
})

from stock import ticker
ticker.TickerTodayPriceSpider.CONCURRENT_REQUESTS_PER_IP = 4
process1.crawl(ticker.TickerTodayPriceSpider())
process1.crawl(ticker.TickerTodayPriceSpider1())
process1.crawl(ticker.TickerTodayPriceSpider2())
process1.crawl(ticker.TickerTodayPriceSpider3())

from stock import full_price_update
process1.crawl(full_price_update.FullPriceSpider())

from stock import min_price_update
process1.crawl(min_price_update.MinPriceDailySpider())

process1.start()
def test_37(self):
    # Basic, useful data
    local_dir = self.sitemaps_spider.local_dir
    website_dir = self.sitemaps_spider.website_folder

    # Disable useless messages from the engine.
    # To be fair, they can be really useful in usual development context,
    # but here they fill up our tests output.
    # Based on: https://stackoverflow.com/a/33204694
    logging.getLogger('scrapy').setLevel(logging.WARNING)
    logging.getLogger('scrapy').propagate = False

    # Single element sitemap
    crawler_process = CrawlerProcess()
    crawler1 = CrawlerWithResults(self.sitemaps_spider)
    crawler_process.crawl(crawler1)
    crawler1.spider.sitemap_urls = [local_dir + website_dir + "/sitemap1.xml"]
    crawler1.spider.name += "1"

    # Multiple element sitemap
    crawler2 = CrawlerWithResults(self.sitemaps_spider)
    crawler_process.crawl(crawler2)
    crawler2.spider.sitemap_urls = [local_dir + website_dir + "/sitemap2.xml"]
    crawler2.spider.name += "2"

    # Multiple sitemaps within a sitemap
    crawler3 = CrawlerWithResults(sitemaps.LocalSitemapsSpider)
    crawler_process.crawl(crawler3)
    crawler3.spider.sitemap_urls = [local_dir + website_dir + "/sitemap3.xml"]
    crawler3.spider.name += "3"

    # Multiple sitemaps within a sitemap
    crawler4 = CrawlerWithResults(sitemaps.LocalSitemapsSpider)
    crawler_process.crawl(crawler4)
    crawler4.spider.sitemap_urls = [local_dir + website_dir + "/sitemap4.xml"]
    crawler4.spider.name += "4"

    # We can't run multiple processes in one script, due to the Twisted Reactor.
    # This is why we test everything in a single test unit.
    # Kind of bad, but it is the only way.
    crawler_process.start()

    # Check that all the tests are good

    # Single page sitemap
    self.assertEqual(crawler1.items, [{'1': '1'}])

    # Multiple page sitemap
    self.assertEqual(
        sorted((key, item[key]) for item in crawler2.items for key in item),
        [(str(x), str(x)) for x in range(2, 7)])

    # Sitemaps within a sitemap
    self.assertEqual(
        sorted((key, item[key]) for item in crawler3.items for key in item),
        [(str(x), str(x)) for x in range(1, 7)])

    # Compressed sitemaps within a sitemap
    self.assertEqual(
        sorted((key, item[key]) for item in crawler4.items for key in item),
        [(str(x), str(x)) for x in range(1, 7)])
def crawl_article_pro():
    process = CrawlerProcess(get_project_settings())
    process.crawl(ArticleSpider)
    process.start()
def craw(repositories):
    process = CrawlerProcess(get_project_settings())
    process.crawl('repositories', repositories=repositories)
    process.start()
#)
#configure_logging({'LOG_STDOUT': True})

def parse_args():
    parser = argparse.ArgumentParser()
    parser.add_argument('--max_page', default=1, type=int)
    parser.add_argument('--cat_id', default='explore', type=str)
    parser.add_argument('--cat_name', default='话题精选', type=str)
    parser.add_argument('--start_url', default='/group/explore', type=str)
    parser.add_argument('--url_keywords', default='topic', type=str)
    parser.add_argument('--sleep', default=3, type=int)
    parser.add_argument('--only_image', default=0, type=int)
    args = parser.parse_args()

    settings = get_project_settings()
    settings.set('MAX_PAGE', args.max_page, 'project')
    settings.set('CAT_ID', args.cat_id, 'project')
    settings.set('CAT_NAME', args.cat_name, 'project')
    settings.set('START_URL', args.start_url, 'project')
    settings.set('URL_KEYWORDS', args.url_keywords, 'project')
    settings.set('DOWNLOAD_DELAY', args.sleep, 'project')
    settings.set('ONLY_IMAGE', args.only_image, 'project')
    return settings

if __name__ == "__main__":
    settings = parse_args()
    crawler_process = CrawlerProcess(settings)
    crawler_process.crawl(DoubanScrapy)
    crawler_process.start()
"""Small spider for downloading large files from gdrive""" name = 'Google Drive Large File Downloader' def __init__(self, url, file_name): self.file_name = file_name self.start_urls = [url] def parse(self, response): """Parses Google's warning page.""" downlaod_url = 'https://drive.google.com' + \ response.xpath('//div[@class="uc-main"]' + '/div[@id="uc-text"]/a/@href').extract()[0] yield Request(url=downlaod_url, callback=self.save_file, meta={'download_maxsize' : 0, 'download_timeout' : 1200}) def save_file(self, response): """Saves downloaded file.""" with open(self.file_name, 'wb') as large_file: large_file.write(response.body) if __name__ == "__main__": # Reduce Scrapy logger verbosity. logging.disable(logging.WARNING) spiderproc = CrawlerProcess({ 'USER_AGENT': 'Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1)' }) spiderproc.crawl(GoogleDriveSpider, url=sys.argv[1], file_name=sys.argv[2]) spiderproc.start()
            '------------------------------ split ------------------------------')
        import pprint
        pprint.pprint(d)
        yield d


# Fallback configuration so the spider can also be run as a standalone script;
# when launched from the project, the code below has no effect.
if __name__ == '__main__':
    import os, time
    from scrapy.crawler import CrawlerProcess

    timestamp = time.strftime("%Y%m%d_%H%M%S", time.localtime())  # year-month-day_hour-minute-second
    filename = 'v{}.json'.format(timestamp)  # output file name (takes effect if the 'FEED_URI' setting is uncommented)
    jobdir = 'JOBDIR/kiCYjoVAmJ'  # request-queue directory (takes effect if the 'JOBDIR' setting is uncommented)

    p = CrawlerProcess({
        'TELNETCONSOLE_ENABLED': False,  # hardly anyone uses this feature; disabling it speeds up crawler startup
        'MEDIA_ALLOW_REDIRECTS': True,   # allow redirects for image download URLs; prefer this whenever images are downloaded
        'LOG_LEVEL': 'INFO',             # DEBUG, INFO, WARNING, ERROR, CRITICAL
        # 'JOBDIR': jobdir,              # uncomment to enable resumable crawls
        #                                # (request queue, dedup fingerprints and crawl state are stored in this folder)
        # 'FEED_URI': filename,          # write scraped data to a file
        # 'FEED_EXPORT_ENCODING': 'utf-8',  # roughly equivalent to ensure_ascii=False
        # 'FEED_FORMAT': 'json',         # export format; defaults to jsonlines when not set,
        #                                # supported formats: json, jsonlines, csv, xml, pickle, marshal
        # 'DOWNLOAD_TIMEOUT': 8,         # global request timeout, default 180; a per-request timeout can be set via meta ('download_timeout')
        # 'DOWNLOAD_DELAY': 1,           # global download delay; the most intuitive of the throttling settings
    })
    p.crawl(VSpider)
    p.start()
def run_scraper():
    process = CrawlerProcess(
        {'USER_AGENT': 'Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1)'})
    process.crawl(HNScrapy)
    process.start()
import scrapy
import sys
sys.path.append('..')  # TODO add example directory to sys.path without this command
from scrapy.crawler import CrawlerProcess
from scrapy.utils.project import get_project_settings
from Diversity import DiversityCrawler
from Rent import RentCrawler
from Transport import TransportCrawler

if __name__ == '__main__':
    process = CrawlerProcess(get_project_settings())
    process.crawl(RentCrawler)
    process.crawl(DiversityCrawler)
    process.crawl(TransportCrawler)
    process.start(stop_after_crawl=True)
") .price::text").extract() fjson["relative_img_link"] = response.xpath( "//div[@class ='image-gradient']/img/@src").extract()[t] fjson["abs"] = base_url + fjson["relative_img_link"] #fjson['image_urls'] = [url_join_imgz(base_url,relative_img_link)] for t in relative_img_link ] #fjson["relative_img_link"] = response.xpath("//div[@class ='image-gradient']/img/@src").extract()[t] fjson["tags"] = response.css(".CampaignPackages-items:nth-child(" + str(t) + ") .tag-names::text").extract() fjson["img_link2"] = response.css( ".CampaignPackages-items:nth-child(" + str(t) + ") .lazyloaded").extract() print(fjson) print( "********************************************************************" ) print(t) with open(base_path + 'data.csv', "a") as fo: fo.write("\n" + str(fjson)) fo.flush() process = CrawlerProcess({ 'USER_AGENT': 'Mozilla/5.0 (Linux; Android 8.0.0; SM-G960F Build/R16NW) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/62.0.3202.84 Mobile Safari/537.36' }) process.crawl(SatsaSpider) process.start() # the script will block here until the crawling is finished
from scrapy.crawler import CrawlerProcess
from scrapy.utils.project import get_project_settings

settings = get_project_settings()
crawler = CrawlerProcess(settings)
crawler.crawl('ch3.18-email')  # spider name
crawler.start()
def start_spider(self):
    p = CrawlerProcess(settings=self.settings)
    p.crawl(JudgementSpider)
    p.start()
def main():
    process = CrawlerProcess(get_project_settings())
    process.crawl('PTTCrawler', urls_txt_path='./target_urls.txt', output_path='test.json')
    process.start()
class ScannerApp:
    """A scanner application which can be run."""

    def __init__(self):
        """
        Initialize the scanner application.
        Takes input, argv[1], which is directly related to the scan job id in the database.
        Updates the scan status and sets the pid.
        """
        self.scan_id = sys.argv[1]

        # Get scan object from DB
        self.scan_object = Scan.objects.get(pk=self.scan_id)
        self.scan_object.set_scan_status_start()

        self.scanner = Scanner(self.scan_id)

    def run(self):
        """Run the scanner, blocking until finished."""
        settings = get_project_settings()
        self.crawler_process = CrawlerProcess(settings)

        if hasattr(self.scan_object, 'webscan'):
            self.start_webscan_crawlers()
        else:
            self.start_filescan_crawlers()

        # Update scan status
        self.scan_object.set_scan_status_done()

    def start_filescan_crawlers(self):
        self.sitemap_spider = None
        self.scanner_spider = self.setup_scanner_spider()
        self.start_crawlers()

    def start_webscan_crawlers(self):
        # Don't sitemap scan when running over RPC or if no sitemap is set on scan
        if not self.scan_object.scanner.process_urls:
            if len(self.scanner.get_sitemap_urls()) != 0 \
                    or len(self.scanner.get_uploaded_sitemap_urls()) != 0:
                self.sitemap_spider = self.setup_sitemap_spider()
            else:
                self.sitemap_spider = None
        else:
            self.sitemap_spider = None

        self.scanner_spider = self.setup_scanner_spider()
        self.start_crawlers()

        if (self.scan_object.webscan.do_link_check
                and self.scan_object.webscan.do_external_link_check):
            # Do external link check
            self.external_link_check(self.scanner_spider.external_urls)

    def start_crawlers(self):
        # Run the crawlers and block
        logging.info('Starting crawler process.')
        self.crawler_process.start()
        logging.info('Crawler process started.')

    def handle_killed(self):
        """Handle being killed by updating the scan status."""
        # self.scan_object = Scan.objects.get(pk=self.scan_id)
        self.scan_object.set_scan_status_failed()
        self.scan.logging_occurrence("SCANNER FAILED: Killed")
        logging.error("Killed")

    def setup_sitemap_spider(self):
        """Setup the sitemap spider."""
        crawler = self.crawler_process.create_crawler(SitemapURLGathererSpider)
        self.crawler_process.crawl(
            crawler,
            scanner=self.scanner,
            runner=self,
            sitemap_urls=self.scanner.get_sitemap_urls(),
            uploaded_sitemap_urls=self.scanner.get_uploaded_sitemap_urls(),
            sitemap_alternate_links=True)
        return crawler.spider

    def setup_scanner_spider(self):
        """Setup the scanner spider."""
        crawler = self.crawler_process.create_crawler(ScannerSpider)
        crawler.signals.connect(self.handle_closed, signal=signals.spider_closed)
        crawler.signals.connect(self.handle_error, signal=signals.spider_error)
        crawler.signals.connect(self.handle_idle, signal=signals.spider_idle)
        self.crawler_process.crawl(crawler, scanner=self.scanner, runner=self)
        return crawler.spider

    def get_start_urls_from_sitemap(self):
        """Return the URLs found by the sitemap spider."""
        if self.sitemap_spider is not None:
            logging.debug('Sitemap spider found')
            return self.sitemap_spider.get_urls()
        else:
            return []

    def external_link_check(self, external_urls):
        """Perform external link checking."""
        logging.info("Link checking %d external URLs..." % len(external_urls))
        for url in external_urls:
            url_parse = urlparse(url)
            if url_parse.scheme not in ("http", "https"):
                # We don't want to allow external URL checking of other
                # schemes (file:// for example)
                continue

            logging.info("Checking external URL %s" % url)
            result = linkchecker.check_url(url)
            if result is not None:
                broken_url = Url(url=url, scan=self.scan_object.webscan,
                                 status_code=result["status_code"],
                                 status_message=result["status_message"])
                broken_url.save()
                self.scanner_spider.associate_url_referrers(broken_url)

    def handle_closed(self, spider, reason):
        """Handle the spider being finished."""
        # TODO: Check reason for if it was finished, cancelled, or shutdown
        logging.debug('Spider is closing. Reason {0}'.format(reason))
        self.store_stats()
        reactor.stop()

    def store_stats(self):
        """Stores scrapy scanning stats when scan is completed."""
        logging.info('Stats: {0}'.format(
            self.scanner_spider.crawler.stats.get_stats()))

        try:
            statistics, created = Statistic.objects.get_or_create(
                scan=self.scanner.scan_object)
        except MultipleObjectsReturned:
            logging.error(
                'Multiple statistics objects found for scan job {}'.format(
                    self.scan_id))

        if self.scanner_spider.crawler.stats.get_value(
                'last_modified_check/pages_skipped'):
            statistics.files_skipped_count += self.scanner_spider.crawler.stats.get_value(
                'last_modified_check/pages_skipped')
        if self.scanner_spider.crawler.stats.get_value(
                'downloader/request_count'):
            statistics.files_scraped_count += self.scanner_spider.crawler.stats.get_value(
                'downloader/request_count')
        if self.scanner_spider.crawler.stats.get_value(
                'downloader/exception_type_count/builtins.IsADirectoryError'):
            statistics.files_is_dir_count += self.scanner_spider.crawler.stats.get_value(
                'downloader/exception_type_count/builtins.IsADirectoryError')

        statistics.save()
        logging.debug('Statistic saved.')

    def handle_error(self, failure, response, spider):
        """Handle spider errors, updating scan status."""
        logging.error("Scan failed: %s" % failure.getErrorMessage())
        self.store_stats()
        scan_object = Scan.objects.get(pk=self.scan_id)
        scan_object.reason = failure.getErrorMessage()
        scan_object.save()

    def handle_idle(self, spider):
        """Handle when the spider is idle.

        Keep it open if there are still queue items to be processed.
        """
        logging.debug("Spider Idle...")
        # Keep spider alive if there are still queue items to be processed
        remaining_queue_items = ConversionQueueItem.objects.filter(
            status__in=[
                ConversionQueueItem.NEW,
                ConversionQueueItem.PROCESSING
            ],
            url__scan=self.scan_object).count()

        if remaining_queue_items > 0:
            logging.info(
                "Keeping spider alive: %d remaining queue items to process" %
                remaining_queue_items)
            raise DontCloseSpider
        else:
            logging.info("No more active processors, closing spider...")
import scrapy
from scrapy.crawler import CrawlerProcess
from scrapy.utils.project import get_project_settings

from ca_scraper.spiders.newspapers.contra_costa_times_spider import ContraCostaTimesSpider
from ca_scraper.spiders.newspapers.la_times_spider import LATimesSpider
from ca_scraper.spiders.newspapers.mercury_news_spider import MercuryNewsSpider
from ca_scraper.spiders.newspapers.oc_register_spider import OCRegisterSpider
from ca_scraper.spiders.newspapers.press_enterprise_spider import PressEnterpriseSpider
from ca_scraper.spiders.newspapers.sacramento_bee_spider import SacramentoBeeSpider
from ca_scraper.spiders.newspapers.san_diego_union_tribune_spider import SanDiegoUnionTribuneSpider
from ca_scraper.spiders.newspapers.san_francisco_chronicle_spider import SanFranciscoChronicleSpider

process = CrawlerProcess(get_project_settings())
process.crawl(ContraCostaTimesSpider)
process.crawl(LATimesSpider)
process.crawl(MercuryNewsSpider)
process.crawl(OCRegisterSpider)
process.crawl(PressEnterpriseSpider)
process.crawl(SacramentoBeeSpider)
process.crawl(SanDiegoUnionTribuneSpider)
process.crawl(SanFranciscoChronicleSpider)
process.start()  # the script will block here until all crawling jobs are finished
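# Note (added): with CrawlerProcess, the spiders scheduled above all run concurrently
# in the same reactor. A minimal sketch, assuming the same spider classes, of running
# them one after another instead, based on the CrawlerRunner pattern from the Scrapy docs:
from twisted.internet import defer, reactor
from scrapy.crawler import CrawlerRunner
from scrapy.utils.log import configure_logging
from scrapy.utils.project import get_project_settings

configure_logging()
runner = CrawlerRunner(get_project_settings())


@defer.inlineCallbacks
def crawl_sequentially():
    # each yield waits for the previous crawl to finish before starting the next
    yield runner.crawl(ContraCostaTimesSpider)
    yield runner.crawl(LATimesSpider)
    yield runner.crawl(MercuryNewsSpider)
    reactor.stop()


crawl_sequentially()
reactor.run()  # the script will block here until the last crawl is finished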
def run_config(config):
    config = ConfigLoader(config)
    CustomMiddleware.driver = config.driver
    DocumentationSpider.NB_INDEXED = 0

    if config.use_anchors:
        from . import scrapy_patch

    strategy = DefaultStrategy(config)

    algolia_helper = AlgoliaHelper(
        config.app_id,
        config.api_key,
        config.index_name,
        AlgoliaSettings.get(config, strategy.levels))

    DOWNLOADER_MIDDLEWARES_PATH = 'scraper.src.custom_middleware.CustomMiddleware'
    DOWNLOADER_CLIENTCONTEXTFACTORY = 'scraper.src.scrapy_patch.CustomContextFactory'

    if __name__ == '__main__':
        DOWNLOADER_MIDDLEWARES_PATH = 'src.custom_middleware.CustomMiddleware'
        DOWNLOADER_CLIENTCONTEXTFACTORY = 'src.scrapy_patch.CustomContextFactory'

    process = CrawlerProcess({
        'LOG_ENABLED': '1',
        'LOG_LEVEL': 'ERROR',
        # 'LOG_LEVEL': 'DEBUG',
        'USER_AGENT': config.user_agent,
        'DOWNLOADER_MIDDLEWARES': {DOWNLOADER_MIDDLEWARES_PATH: 900},
        # Need to be > 600 to be after the redirectMiddleware
        'DOWNLOADER_CLIENTCONTEXTFACTORY': DOWNLOADER_CLIENTCONTEXTFACTORY
    })

    process.crawl(
        DocumentationSpider,
        config=config,
        algolia_helper=algolia_helper,
        strategy=strategy)

    process.start()
    process.stop()

    # Kill browser if needed
    BrowserHandler.destroy(config.driver)

    if len(config.extra_records) > 0:
        algolia_helper.add_records(config.extra_records, "Extra records")

    if len(Camelizer.synonyms) > 0:
        algolia_helper.add_synonyms(Camelizer.synonyms)

    print("")

    if DocumentationSpider.NB_INDEXED > 0:
        algolia_helper.commit_tmp_index()
        print('Nb hits: ' + str(DocumentationSpider.NB_INDEXED))
        config.update_nb_hits(DocumentationSpider.NB_INDEXED)
    else:
        print('Crawling issue: nbHits 0 for ' + config.index_name)
        algolia_helper.report_crawling_issue()

    print("")
def scrap_data():
    process = CrawlerProcess(get_project_settings())
    process.crawl('books')
    process.crawl('hozmart')
    process.start()
from scrapy.crawler import CrawlerProcess
from scrapy.settings import Settings

from edu_parse.spiders.autoyoula import AutoyoulaSpider

if __name__ == '__main__':
    crawler_settings = Settings()
    crawler_settings.setmodule("edu_parse.settings")

    crawler_proc = CrawlerProcess(settings=crawler_settings)
    crawler_proc.crawl(AutoyoulaSpider)
    crawler_proc.start()
def handle(self, *args, **options):
    process = CrawlerProcess(get_project_settings())
    process.crawl(KalerkanthoSpider)
    process.start()
def foo():
    process = CrawlerProcess()
    process.crawl(RFA_spider)
    process.start()
class Data_Spider:
    def __init__(self):
        self.process = CrawlerProcess(get_project_settings())
        self.db = DBSession()
        self.init_seed_data()
        # Set default values
        # self.title_word = str(input('Enter the keyword used to match academic seminar notices:'))
        self.title = '报告题目:,学术报告:,题目,报告主题:,Title'  # (default values)
        self.speaker = '报告人:,主讲人:,汇报人:,Speaker,报告专家'
        self.venue = '地点:,Address,Venue,Place'
        self.time = '日期:,时间:,Time'
        self.title_word = ''

    # Initialize the seed table data
    def init_seed_data(self):
        init = self.db.query(Seed).all()
        if len(init) == 0:
            init_data = Seed()
            init_data.set_init_data(self.db)

    def set_college_url(self, college_url):
        # self.college_url = input('Enter the notice page URL of the university to crawl:')  # start_url
        self.college_url = college_url

    def set_college(self, college):
        self.college = college

    def set_next_xpath(self, next_xpath):
        self.next_xpath = next_xpath

    def set_url_xpath(self, url_xpath):
        self.url_xpath = url_xpath

    def set_text_xpath(self, text_xpath):
        self.text_xpath = text_xpath

    # Separate multiple keywords with ","
    def set_title_word(self):
        self.title_word = ''

    def set_notify_time_xpath(self, notify_time_xpath):
        if len(notify_time_xpath) > 0:
            self.notify_time_xpath = notify_time_xpath
        else:
            self.notify_time_xpath = ''

    # Keyword setters, now deprecated
    # def set_title(self, title):
    #     if len(title) > 0:
    #         self.title = self.title + ',' + title
    #     self.title = self.title.replace(',', ',')
    # def set_speaker(self, speaker):
    #     if len(speaker) > 0:
    #         self.speaker = self.speaker + ',' + speaker
    #     self.speaker = self.speaker.replace(',', ',')
    # def set_venue(self, venue):
    #     if len(venue) > 0:
    #         self.venue = self.venue + ',' + venue
    #     self.venue = self.venue.replace(',', ',')
    # def set_time(self, time):
    #     if len(time) > 0:
    #         self.time = self.time + ',' + time
    #     self.time = self.time.replace(',', ',')

    # def insert_seed(self, college_url):
    # def insert_seed_test(self):
    #     self.insert_seed()

    def insert_seed(self, db):
        # college_url = str(input('Enter the notice page URL of the university to crawl:'))
        # This part is skipped once the GUI is in place:
        # self.set_college_url(college_url)
        # college = str(input('Enter the name of the university (college) to crawl:'))
        # self.set_college(college)
        # next_xpath = str(input('Enter the XPath selector for the notice site\'s next-page link:'))
        # self.set_next_xpath(next_xpath)
        # url_xpath = str(input('Enter the XPath for each notice link on the notice site:'))
        # self.set_url_xpath(url_xpath)
        # text_xpath = str(input('Enter the XPath for each line of the notice body on a notice page:'))
        # self.set_text_xpath(text_xpath)
        # notify_time_xpath = str(input('Enter the XPath for the notice time on a notice page, empty by default (skip if absent):'))
        # self.set_notify_time_xpath(notify_time_xpath)
        # The five items above are required; the following are optional.
        # title_word = str(input('Enter the matching rule for notice titles on the listing page (optional):'))
        # self.title_word = title_word
        # title = str(input('Enter the matching rule for the talk title (optional):'))
        # self.set_title(title)
        # speaker = str(input('Enter the matching rule for the speaker (optional):'))
        # self.set_speaker(speaker)
        # venue = str(input('Enter the matching rule for the venue (optional):'))
        # self.set_venue(venue)
        # time = str(input('Enter the matching rule for the talk time (optional):'))
        # self.set_time(time)
        try:
            seed = Seed(start_url=self.college_url, college=self.college, url_xpath=self.url_xpath,
                        nextpage_xpath=self.next_xpath, title_word=self.title_word,
                        notice_time_xpath=self.notify_time_xpath,
                        # title=self.title, speaker=self.speaker, venue=self.venue, time=self.time,
                        text_xpath=self.text_xpath)
            db.add(seed)
            db.commit()
        except Exception as e:
            print(e)
            db.rollback()
            print('Failed to insert data')

    # Crawl a single specified university
    def get_existed_urls(self, seed):
        existed_urls = []
        urls = self.db.query(Notification.url).filter(seed.college == Notification.college).all()
        # existed_urls=[]
        if len(urls) > 0:
            for url in urls:
                existed_urls.append(url[0])
        return existed_urls

    # Generic flow for crawling a university's seminar information
    def common_spider(self, seed):
        urlHandle = UrlHandle()
        existed_urls = self.get_existed_urls(seed)
        urlHandle.set_start_url(seed.start_url)
        urlHandle.set_title_word(seed.title_word)
        urlHandle.set_existed_urls(existed_urls)
        urlHandle.set_nextpage_xpath(seed.nextpage_xpath)
        urlHandle.set_url_xpath(seed.url_xpath)
        title_urls = urlHandle.get_filte_urls()
        selenium_spider = SeleniumSpider(seed, title_urls)
        selenium_spider.start_selenium()
        # self.process.crawl(NoticeSpider, seed, title_urls)
        # self.process.start()

    # Crawl seminar information for a single university
    def university_spider(self, seed):
        # college_url = self.set_college_url()
        # seed = self.db.query(Seed).filter(Seed.start_url == college_url).one()
        if seed.start_url == 'https://iiis.tsinghua.edu.cn/zh/seminars/':  # Tsinghua University
            self.process.crawl(ThuIiisSpider)
            self.process.start()
        else:
            self.common_spider(seed)

    # Crawl seminar information for all universities; crawling them all in one go raises errors
    def universities_spider(self):
        seeds = self.db.query(Seed).all()
        for seed in seeds:
            # For each university, call the single-university crawl directly
            self.university_spider(seed)

    # def start_spider(self):
    #     is_one_spider = str(input('Crawl one university (y) or multiple universities (n)? y/n'))
    #     while True:
    #         print(is_one_spider)
    #         if is_one_spider in ['y', 'Y', 'yes', 'Yes']:
    #             college_url = str(input('Enter the notice page URL of the university to crawl:'))
    #             seed = self.db.query(Seed).filter(Seed.start_url == college_url).all()
    #             if len(seed) == 0:
    #                 seed = self.insert_seed(college_url)
    #                 self.university_spider(seed)
    #             else:
    #                 self.university_spider(seed[0])
    #             is_continue = str(input('Crawl finished; continue? y/n'))
    #             if is_continue in ['y', 'Y', 'yes', 'Yes']:
    #                 is_one_spider = str(input('Crawl one university (y) or multiple universities (n)? y/n'))
    #             else:
    #                 break
    #         elif is_one_spider in ['n', 'no', 'No', 'N']:
    #             self.universities_spider()
    #             print('All information has been crawled!')
    #             break
    #         else:
    #             print('Invalid input, please try again:')
    #             is_one_spider = str(input('Crawl one university (y) or multiple universities (n)? y/n'))


# Run this from the main program:
# spider = Data_Spider()
# spider.start_spider()

# Example inputs:
# Notice page URL: http://sist.swjtu.edu.cn/list.do?action=news&navId=40
# University (college) name: School of Information Science and Technology, Southwest Jiaotong University
# Next-page XPath selector: //div[@class="tableFootLeft"]//a[text()="下一页"]
# Notice link XPath: //*[@id="rightPageContent"]/dl//dd
# Notice body text XPath: //*[@id="newsBody"]
# Notice time XPath (skip if absent): //*[@id="newsInfo"]

# http://cs.gzu.edu.cn/forum.php?mod=forumdisplay&fid=57&page=1
# School of Computer Science and Technology, Guizhou University
# url_xpath=//*[@id="newsList"]//p
# nextpage=//*[@id="bmbw0pgscl"]/div//a[text()='下一页']
# notify_time=//*[@id="ct"]/div[1]/div/div[1]/p
# full notice text=//td[@class="t_f"]
import json
import scrapy


class MainSpider(scrapy.Spider):
    name = 'main'
    # allowed_domains = ['longandfoster.com']
    start_urls = ['https://www.longandfoster.com/include/ajax/api.aspx?op=SearchAgents&firstname=&lastname=&page=1&pagesize=200']

    def parse(self, response):
        resp = json.loads(json.loads(response.body)['Entity'])
        for each in resp:
            name = each.get('DisplayName')
            yield {
                "Name": name,
            }


# --- run without project and save in `output.csv` ---
from scrapy.crawler import CrawlerProcess

c = CrawlerProcess({
    'USER_AGENT': 'Mozilla/5.0',
    # save in file CSV, JSON or XML
    'FEED_FORMAT': 'csv',     # csv, json, xml
    'FEED_URI': 'output.csv',
})
c.crawl(MainSpider)
c.start()
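# Note (added): on recent Scrapy releases (2.1+) the FEED_FORMAT / FEED_URI pair used
# above is deprecated in favour of the FEEDS setting. A minimal sketch of the same
# "run without a project, export to output.csv" idea with that setting:
from scrapy.crawler import CrawlerProcess

c = CrawlerProcess({
    'USER_AGENT': 'Mozilla/5.0',
    'FEEDS': {
        'output.csv': {'format': 'csv'},  # output path mapped to exporter options
    },
})
c.crawl(MainSpider)
c.start()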
def handle(self, *args, **options):
    process = CrawlerProcess(get_project_settings())
    process.crawl(HemnetSpider.HemnetSpider)
    process.start()
def run_config(config):
    config = ConfigLoader(config)
    CustomDownloaderMiddleware.driver = config.driver
    DocumentationSpider.NB_INDEXED = 0

    strategy = DefaultStrategy(config)

    meilisearch_helper = MeiliSearchHelper(
        config.app_id,
        config.api_key,
        config.index_uid,
        config.custom_settings
    )

    root_module = 'src.' if __name__ == '__main__' else 'scraper.src.'
    DOWNLOADER_MIDDLEWARES_PATH = root_module + 'custom_downloader_middleware.' + CustomDownloaderMiddleware.__name__
    DUPEFILTER_CLASS_PATH = root_module + 'custom_dupefilter.' + CustomDupeFilter.__name__

    # Defaults for scrapy https://docs.scrapy.org/en/latest/topics/settings.html#default-request-headers
    headers = {
        "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8",
        "Accept-Language": "en",
    }

    if os.getenv("CF_ACCESS_CLIENT_ID") and os.getenv("CF_ACCESS_CLIENT_SECRET"):
        headers.update({
            "CF-Access-Client-Id": os.getenv("CF_ACCESS_CLIENT_ID"),
            "CF-Access-Client-Secret": os.getenv("CF_ACCESS_CLIENT_SECRET"),
        })
    elif os.getenv("IAP_AUTH_CLIENT_ID") and os.getenv("IAP_AUTH_SERVICE_ACCOUNT_JSON"):
        iap_token = IAPAuth(
            client_id=os.getenv("IAP_AUTH_CLIENT_ID"),
            service_account_secret_dict=json.loads(
                os.getenv("IAP_AUTH_SERVICE_ACCOUNT_JSON")),
        )(requests.Request()).headers["Authorization"]
        headers.update({"Authorization": iap_token})
    elif os.getenv("KC_URL") and os.getenv("KC_REALM") and os.getenv(
            "KC_CLIENT_ID") and os.getenv("KC_CLIENT_SECRET"):
        realm = KeycloakRealm(server_url=os.getenv("KC_URL"),
                              realm_name=os.getenv("KC_REALM"))
        oidc_client = realm.open_id_connect(
            client_id=os.getenv("KC_CLIENT_ID"),
            client_secret=os.getenv("KC_CLIENT_SECRET"))
        token_response = oidc_client.client_credentials()
        token = token_response["access_token"]
        headers.update({"Authorization": 'bearer ' + token})

    DEFAULT_REQUEST_HEADERS = headers

    process = CrawlerProcess({
        'LOG_ENABLED': '1',
        'LOG_LEVEL': 'ERROR',
        'USER_AGENT': config.user_agent,
        'DOWNLOADER_MIDDLEWARES': {DOWNLOADER_MIDDLEWARES_PATH: 900},
        # Need to be > 600 to be after the redirectMiddleware
        'DUPEFILTER_USE_ANCHORS': config.use_anchors,
        # Use our custom dupefilter in order to be scheme agnostic regarding link provided
        'DUPEFILTER_CLASS': DUPEFILTER_CLASS_PATH,
        'DEFAULT_REQUEST_HEADERS': DEFAULT_REQUEST_HEADERS,
    })

    process.crawl(
        DocumentationSpider,
        config=config,
        meilisearch_helper=meilisearch_helper,
        strategy=strategy)

    process.start()
    process.stop()

    # Kill browser if needed
    BrowserHandler.destroy(config.driver)

    if len(config.extra_records) > 0:
        meilisearch_helper.add_records(config.extra_records, "Extra records", False)

    print("")

    if DocumentationSpider.NB_INDEXED > 0:
        # meilisearch_helper.commit_tmp_index()
        print('Nb hits: {}'.format(DocumentationSpider.NB_INDEXED))
    else:
        print('Crawling issue: nbHits 0 for ' + config.index_uid)
        # meilisearch_helper.report_crawling_issue()
        sys.exit(EXIT_CODE_NO_RECORD)

    print("")
def validate():
    process = CrawlerProcess(settings=settings)
    process.crawl(ValidatorSpider)
    process.start()
def main():
    process = CrawlerProcess(settings=get_project_settings())
    process.crawl(OpenlibraryLoginSpider)
    process.start()
from scrapy import signals
from scrapy.crawler import Crawler, CrawlerProcess
from scraper import IMDbTop1000Spider
from index import IMDbIndex
from flask import Flask, request

app = Flask(__name__)

# Crawl the service using our spider and store the items in a list
movies = []

def collect_items(item, response, spider):
    movies.append(item)

crawler = Crawler(IMDbTop1000Spider)
crawler.signals.connect(collect_items, signals.item_scraped)

process = CrawlerProcess()
process.crawl(crawler)
process.start()  # block until finished

# Index this data to Whoosh
imdb_index = IMDbIndex()
imdb_index.bulk_index(movies)


@app.route('/search', methods=['GET'])
def index():
    search_term = request.args.get('q', type=str)
    return imdb_index.search(search_term)
        }
        stock = VietstockItem()
        stock['date'] = items['date']
        stock['time'] = items['time']
        stock['stock_name'] = items['stock_name']
        stock['price'] = items['price']
        yield stock


def run_crawl():
    runner = CrawlerRunner({
        'USER_AGENT': 'Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1)',
    })
    deferred = runner.crawl(StockSpider)
    # you can use reactor.callLater or task.deferLater to schedule a function
    deferred.addCallback(lambda _: reactor.callLater(5, run_crawl))
    return deferred


if __name__ == "__main__":
    process = CrawlerProcess(get_project_settings())
    scheduler = TwistedScheduler()
    scheduler.add_job(process.crawl, 'interval', args=[StockSpider], seconds=10)
    scheduler.start()
    process.start(False)