def run():
    config.connect_to_client()
    print('Running event processor...')

    settings = get_project_settings()
    spider_loader = spiderloader.SpiderLoader.from_settings(settings)
    spiders = spider_loader.list()
    classes = [
        s for s in (spider_loader.load(name) for name in spiders
                    if config.spider_name is None or name == config.spider_name)
        if s.enabled
    ]

    crawlerProcess = CrawlerProcess(settings)
    for spider_class in classes:
        crawlerProcess.crawl(spider_class)
    crawlerProcess.start()
    crawlerProcess.join()
    print('Event processor completed')

    events = requests.get(config.get_events, params={})
    if len(events.json()) > 0:
        print('Data retrieved successfully')
    else:
        print('No data retrieved')
def main():
    settings = get_project_settings()
    process = CrawlerProcess(settings)
    process.crawl(IndiatimesSpider)
    process.start()
    process.join()
def run(self):  # run() is the fixed entry point; it is called automatically when the process starts
    print("Starting foreground crawl task")
    settings = get_project_settings()
    settings.set(
        'USER_AGENT',
        "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) "
        "Chrome/70.0.3538.77 Safari/537.36")
    settings.set('LOG_FILE', self.name + ".log")
    settings.set('ROBOTSTXT_OBEY', False)

    process = CrawlerProcess(settings)
    process.crawl(QuotesSpider, shop_name=self.name)
    process.start()
    process.join()
    print("One round of foreground crawling finished")

    count = random.randint(10, 30)
    database = DataManager(self.name)
    attr = database.getAttr("EMPTY")
    if attr["minute"] != 0:
        count = attr["minute"] * 60
    minute = 0
    while minute <= count:
        database.handlerStatus()
        minute += 1
        time.sleep(1)
        attr = database.getAttr("EMPTY")
        count = attr["minute"] * 60
def run():
    status, msg = config.connect_to_client()
    if not status:
        print(msg)
        sys.exit(1)

    # Look for one month of events for testing purposes
    start_date = datetime.now().strftime('%m-%d-%Y')
    end_date = (datetime.now() + relativedelta(months=+1)).strftime('%m-%d-%Y')

    print('Running event processor...')
    crawlerProcess = CrawlerProcess(get_project_settings())
    crawlerProcess.crawl(HistorySpider, start_date, end_date)
    crawlerProcess.crawl(WpbccSpider, start_date, end_date)
    crawlerProcess.crawl(LWVChicago, start_date, end_date)
    crawlerProcess.crawl(LibraryEvents, start_date, end_date)
    crawlerProcess.crawl(GreatLakesReader, start_date, end_date)
    crawlerProcess.start()
    crawlerProcess.join()
    print('Event processor completed')

    events = requests.get(config.db_get_events,
                          params={'start_timestamp': 0, 'end_timestamp': 10000000000})
    if len(events.json()) > 0:
        print('Data retrieved successfully')
    else:
        print('No data retrieved')
def a():
    settings = get_project_settings()
    settings.set('ITEM_PIPELINES', {'pipl.MoviePipeline': 100})
    crawler = CrawlerProcess(settings)
    crawler.crawl(MeijuSpider)
    crawler.start()
    crawler.join()
def handleDatas(self, oriurl, orilable, urllists, delete_urllists):
    """
    When the download button is clicked, collect the added URLs and start the spider to download them.
    :return:
    """
    self.urllists = urllists
    if delete_urllists != []:
        # p_delete = Process(target=self.deleteOldDatas, args=(delete_urllists,))
        # p_delete.start()
        # p_delete.join()
        # p_delete = threading.Thread(target=self.deleteOldDatas, args=(delete_urllists,))
        # p_delete.start()
        # p_delete.join()
        self.deleteOldDatas(delete_urllists)

    aim_lables = []
    for item in urllists:
        aim_lables.append(orilable[oriurl.index(item)])

    # Run the insert statement here: add the new url + label rows to the details table
    threading.Thread(target=self.insertUrlLableIntoSQL, args=(urllists, aim_lables)).start()

    urllists = ','.join(urllists)
    # cmdline.execute(["scrapy", "crawl", "tmallMain", "-a", "url_lists=" + urllists])
    process = CrawlerProcess(get_project_settings())
    # Replace 'tmallMain' with your own spider name
    process.crawl('tmallMain', url_lists=urllists)
    process.start()  # the script will block here until the crawling is finished
    process.join()
def run():
    settings = get_project_settings()
    crawler = CrawlerProcess(settings)
    crawler.crawl(RyanscomputersSpider)
    crawler.crawl(StartechSpider)
    crawler.start()
    crawler.join()
def crawl_run():
    print('Starting crawl............')
    scope = 'all'
    process = CrawlerProcess(settings=get_project_settings())
    process.crawl(SentispiderSpider, scope)
    process.start()
    process.join()
    print('Crawl finished............')
def crawl_policy_watch():
    """Start the crawling process that fetches government policies from the website covid19policywatch.org."""
    settings = Settings()
    process = CrawlerProcess(settings)
    process.crawl(PolicyWatchSpider)
    process.start()
    process.join()
def run(self):
    """
    Start the client and the scraping job, then take the results (a list of image URLs)
    and feed a pool of worker processes that download and store non-duplicate images.
    :return: self
    """
    if self.hashes is None:
        logging.error('prepare() function was not called before')
        return None

    # results = []
    queue = multiprocessing.Queue()
    pool = [
        multiprocessing.Process(target=self._queue_worker, args=(queue, ))
        for _ in range(self.num_processes)
    ]
    for process in pool:
        process.start()
    # pool = multiprocessing.Pool(self.num_processes, self._worker_main, (queue,))

    def crawler_results(signal, sender, item, response, spider):
        """Helper called each time a page is scraped; pushes its image URLs onto the queue."""
        # results.append(item)
        for x in item['urls']:
            queue.put(x)

    dispatcher.connect(crawler_results, signal=signals.item_passed)

    process = CrawlerProcess({
        'USER_AGENT': 'Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1)'
    })
    process.crawl(WallpapersSpider,
                  start_time=self.start_time,
                  end_time=self.end_time,
                  resolution=self.resolution,
                  start_url=self.BASE_URL)
    logging.getLogger('scrapy').setLevel(logging.ERROR)
    process.start()

    # Tell each worker process to shut down, then wait for them to finish.
    for _ in range(self.num_processes):
        queue.put('STOP')
    # results = [x for res in results for x in res['urls']]
    # logging.info(f'ALL IMAGES URLS: {", ".join(results)}')
    # with multiprocessing.Pool(self.num_processes) as pool:
    #     pool.map(self._process_urls, results)
    for process in pool:
        process.join()
    return self
def run_scrapy(spider):
    """
    Use the Scrapy framework to run the weather spider, which crawls weather data
    for every month from 2011 to 2019 and stores it in MongoDB.
    """
    project_settings = get_project_settings()
    process = CrawlerProcess(project_settings)
    process.crawl(spider)
    process.start()
    process.join()  # inter-process synchronization
def main(*, query_type, url):
    file_path = _get_file_path(query_type)
    process = CrawlerProcess({
        **get_project_settings(),
        "FEED_URI": file_path,
        "FEED_FORMAT": "csv"
    })
    process.crawl('immoscout', url=url)
    process.start()
    process.join()
    _post_process(query_type, suffix="csv")
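# A hedged variant of the snippet above, not the project's actual code: newer Scrapy
# releases (2.1+) deprecate FEED_URI/FEED_FORMAT in favour of the FEEDS setting.
# _get_file_path, _post_process and the 'immoscout' spider name are reused from the
# snippet above; the function name main_with_feeds is hypothetical.
from scrapy.crawler import CrawlerProcess
from scrapy.utils.project import get_project_settings

def main_with_feeds(*, query_type, url):
    file_path = _get_file_path(query_type)
    process = CrawlerProcess({
        **get_project_settings(),
        # FEEDS maps an output URI to its export options
        "FEEDS": {file_path: {"format": "csv"}},
    })
    process.crawl('immoscout', url=url)
    process.start()  # blocks until the crawl finishes
    _post_process(query_type, suffix="csv")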
def ensure_msig_path() -> str:
    """Download the GSEA data and return the path."""
    if not os.path.exists(GMT_ENTREZ_PATH):
        process = CrawlerProcess({
            'USER_AGENT': 'Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1)',
        })
        process.crawl(GSEASpider)
        process.start()
        process.join()
    return GMT_ENTREZ_PATH
def runSpider(uid):
    print(uid)
    dir_name = './result/' + str(uid)
    if not os.path.isdir(dir_name):
        os.makedirs(dir_name)
    process = CrawlerProcess(get_project_settings())
    print(get_project_settings())
    # name = ['weibocn']
    process.crawl('weibocn', uid)
    process.start()
    process.join()
def crawl_lad(depth=lad_depth, urls=None, domain=lad_domain):
    """Starts crawling process which downloads pdfs from all prepared .gov websites"""
    if urls is None:
        urls = list(get_gov_websites(gov_sites_path))
    settings = scrapy_settings(depth, concurrent_requests)
    process = CrawlerProcess(settings)
    process.crawl(LadSpider, urls, domain)
    process.start()
    process.join()
def crawl(self):
    """
    Crawl through the database and either save the results to a database or to text files.
    """
    # setup settings
    from scrapy.settings import Settings
    from scrapytest.spiders import GuardianNewsSpider

    settings = Settings()
    settings.set("USER_AGENT", config['crawler_user_agent'])
    settings.set("LOG_LEVEL", self._args['log_level'])
    settings.set('custom_guardian_config', self._custom_guardian_config)

    crawler = CrawlerProcess(settings=settings)
    crawler.crawl(GuardianNewsSpider)
    crawler.start()
    crawler.join()
def main():
    if len(sys.argv) != 2:
        print('usage: run_spider.py file-config')
        sys.exit(1)

    file_config = sys.argv[1]
    if not os.path.exists(file_config):
        print(f'Not found: {file_config}')
        sys.exit(1)

    configure_logging()
    settings = get_project_settings()
    _murlok = Murlok(file_config)

    formats = ('json', 'xml', 'jsonlines', 'csv', 'pickle', 'marshal')
    settings.set('FEED_EXPORT_ENCODING', str(_murlok.encoding))
    if _murlok.format in formats:
        settings.set('FEED_FORMAT', str(_murlok.format))
        settings.set('FEED_URI', _murlok.spider + '.' + str(_murlok.format))
    else:
        # Pipeline config with peewee: pip install -U peewee
        # formats: mysql, postgresql, sqlite
        settings.set('ITEM_PIPELINES', '{"murlok.pipelines.MurlokPipeline": 300}')

    runner = CrawlerProcess(settings)
    runner.crawl(MurlokSpider, murlok=_murlok)
    d = runner.join()
    d.addBoth(lambda _: reactor.stop())
    reactor.run()
def main(args):
    settings = Settings()
    settings.setmodule(iw_settings)
    spider = ThedySpider()

    process = CrawlerProcess(settings)
    crawler = process.create_crawler(spider)
    crawler.signals.connect(item_scraped, signal=signals.item_scraped)
    process.crawl(crawler)
    process.start(stop_after_crawl=True)
    process.join()

    result["scraping_time"] = result["scraping_time"].isoformat()
    doc = {"doc": dict(result)}
    return doc
def runAllCities(cityPairs, days):
    a = time.time()
    process = CrawlerProcess(get_project_settings())
    for pair in cityPairs:
        process.crawl(SWAFareSpider, fromCity=pair[0], days=days, toCity=pair[1])
    d = process.join()
    d.addBoth(lambda _: reactor.stop())
    reactor.run()  # the script will block here until all crawling jobs are finished
    print("crawl time: " + str(time.time() - a))
def main(argv=None):
    """Entry point to the anime planet link scraper."""
    argv = argv or sys.argv[1:]
    parser = argparse.ArgumentParser("""Scrape anime character profiles.""")
    parser.add_argument("--manifest",
                        metavar="OUTPUT",
                        type=str,
                        default=None,
                        required=False)
    parser.add_argument("--pages-directory",
                        metavar="PAGES",
                        type=str,
                        required=True)
    result = parser.parse_args(argv)

    # maybe get the previous manifest entries (to write back out into the new
    # manifest).
    if result.manifest and os.path.exists(result.manifest):
        with open(result.manifest, "r") as fileobj:
            previous_manifest = json.load(fileobj)
    else:
        previous_manifest = []

    with open(result.manifest or sys.stdout, "w") as manifest_fileobj, \
         JSONListStream(manifest_fileobj) as json_stream:
        previously_scraped_urls = set()
        for item in previous_manifest:
            json_stream.write(item)
            previously_scraped_urls.add(item["url"])

        for entry in os.scandir(result.pages_directory):
            filename = os.path.basename(entry.path)
            b64_url, ext = filename.split(".")
            if ext != "html":
                continue
            previously_scraped_urls.add(base64_urldecode(b64_url))

        spider_cls = make_anime_planet_spider_cls(previously_scraped_urls)
        process = CrawlerProcess({"COOKIES_ENABLED": False})
        process.crawl(spider_cls,
                      manifest_file=json_stream,
                      pages_directory=result.pages_directory,
                      previously_scraped_urls=previously_scraped_urls)
        process.start()
        process.join()
def runUserFlights(userFlights):
    a = time.time()
    process = CrawlerProcess(get_project_settings())
    for flight in userFlights:
        if flight.date > datetime.now():  # check in timezone of flight..
            process.crawl(SWAFareSpider,
                          fromCity=flight.origin,
                          days=1,
                          toCity=flight.destination,
                          startDate=flight.date)
    d = process.join()
    d.addBoth(lambda _: reactor.stop())
    reactor.run()  # the script will block here until all crawling jobs are finished
    print("crawl time: " + str(time.time() - a))
def run_all_spiders():
    process_default = CrawlerProcess()
    # We assume that the default spider has to run first and finish before the other
    # spiders run (see the sequential-crawl sketch after this function).
    process_default.crawl(DefaultQuotesSpider)
    process_default.start()
    time.sleep(120)
    # process_default.stop()

    process = CrawlerProcess()
    process.join()
    active_spiders = [
        TableQuotesSpider,
        JavascriptQuotesSpider,
        LoginQuotesSpider,
        InfiniteScrollQuotesSpider,
    ]
    for spider in active_spiders:
        process.crawl(spider)
    process.start()
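# run_all_spiders() above relies on time.sleep() and a second CrawlerProcess to order
# the spiders, but Scrapy's Twisted reactor cannot be restarted, so calling start()
# twice in one process typically raises ReactorNotRestartable. A minimal sketch of the
# documented CrawlerRunner pattern for running spiders sequentially, assuming the same
# spider classes as above; the function name is hypothetical.
from twisted.internet import reactor, defer
from scrapy.crawler import CrawlerRunner
from scrapy.utils.log import configure_logging

def run_all_spiders_sequentially():
    configure_logging()
    runner = CrawlerRunner()

    @defer.inlineCallbacks
    def crawl():
        # The default spider runs first and must finish before the others start.
        yield runner.crawl(DefaultQuotesSpider)
        for spider in (TableQuotesSpider, JavascriptQuotesSpider,
                       LoginQuotesSpider, InfiniteScrollQuotesSpider):
            yield runner.crawl(spider)
        reactor.stop()

    crawl()
    reactor.run()  # blocks until all crawls have finished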
def _crawl(self, spider, qis_running):
    write_in_a_file('CrawlerProcess.signal.error', {'signals': dir(signals)}, 't.txt')
    qis_running.put(spider)
    crawler = CrawlerProcess(get_project_settings())
    crawler.crawl(spider)
    # To prevent the infamous error: django.db.utils.InterfaceError: (0, '')
    db.connection.close()
    crawler.start()
    write_in_a_file('SpiderProcess.start: process started', {}, 'debug.txt')
    crawler.join()
    write_in_a_file('SpiderProcess.crawl: process joined', {}, 'task.txt')
    write_in_a_file('SpiderProcess.crawl: process joined', {}, 'tasks.txt')
    write_in_a_file('SpiderProcess.crawl: process joined', {}, 'spider.txt')
    write_in_a_file(f'Crawler Process - before: qis_running.qsize: {qis_running.qsize()}', {}, 'tasks.txt')
    try:
        qis_running.get()
    except Exception as e:
        write_in_a_file(f'Crawler Process - error in qis_running.get: {e}', {}, 'tasks.txt')
    write_in_a_file(f'Crawler Process - after: qis_running.qsize: {qis_running.qsize()}', {}, 'tasks.txt')
    write_in_a_file('===========================================================================================', {}, 'tasks.txt')
class RunCrawler(object):
    """ RunCrawler class """

    def __init__(self):
        self.running = False
        self.process = None

    def start(self):
        """ Start """
        if self.running:
            return
        self.running = True
        # Load the common project settings
        self.process = CrawlerProcess(get_project_settings())
        # Launch spiders according to the rules
        proxyRules = ProxyRules()
        for (k, v) in proxyRules.Rules.iteritems():
            if isinstance(v, dict):
                if 'enable' in v and v['enable']:
                    logger.info('Start crawl name:%(name)s rule:%(rule)s', {
                        'name': v['name'],
                        'rule': k
                    })
                    self.process.crawl(CommonSpider, v)
        # the script will block here until the crawling is finished
        self.process.start()
        self.process.join()
        self.process.stop()
        self.running = False

    def stop(self):
        """ Stop """
        if self.running:
            self.running = False
            self.process.stop()
def main():
    ''' Starts harvest script '''
    logger.info('launching main')
    # tz = pytz.timezone('America/Los_Angeles')
    tz = pytz.timezone('EST')
    start = datetime.datetime.now(tz=tz)

    if config['use_proxy']:
        update_proxies()

    # launching crawlers
    store = Store()
    process = CrawlerProcess(custom_settings)
    for spider in spiders:
        logger.info(f'starting {spider.name}')
        process.crawl(spider, store=store)
    process.start()
    process.join()

    end = datetime.datetime.now(tz=tz)
    logger.info(f"runtime: {end - start}")
class CrawlerExecutor():
    count = 0

    def __init__(self, spider):
        def increment_count(cls):
            print('incrementing count')
            cls.count = cls.count + 1

        dispatcher.connect(lambda _: print('FINIsh'), signal=signals.spider_closed)
        dispatcher.connect(increment_count, signal=signals.item_passed)
        settings = get_project_settings()
        self.process = CrawlerProcess(settings)
        self.spider = spider

    def start(self):
        self.process.crawl(self.spider)
        self.process.start()

    def join(self):
        self.process.join()
def getDomainUrls(startUrl, allowedDomains, invalidStrings):
    pool = Urllib3PoolFactory.getPool()
    request = pool.request('GET', startUrl + '/robots.txt')

    disallowUris = []
    try:
        # urllib3 returns the body as bytes, so decode before splitting into lines
        for line in request.data.decode('utf-8', errors='ignore').split('\n'):
            if 'disallow' in line.lower():
                disallowUris.append(line.lower().split(':')[1].strip())
    except:
        pass

    process = CrawlerProcess({
        'USER_AGENT': 'Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1)',
        'LOG_ENABLED': True,
    })
    ScrapyCrawler.configure([startUrl], allowedDomains, invalidStrings, disallowUris)
    process.crawl(ScrapyCrawler)
    process.start()
    process.join()
    return ScrapyCrawler.domainUrls
os.chdir('data_aggregators')

# Look for one month of events for testing purposes
start_date = datetime.now().strftime('%m-%d-%Y')
end_date = (datetime.now() + relativedelta(months=+1)).strftime('%m-%d-%Y')

print('Running data engine...')
crawlerProcess = CrawlerProcess(get_project_settings())
crawlerProcess.crawl(HistorySpider, start_date, end_date)
crawlerProcess.crawl(WpbccSpider, start_date, end_date)
crawlerProcess.crawl(LWVchicago, start_date, end_date)
crawlerProcess.crawl(LibraryEvents, start_date, end_date)
crawlerProcess.crawl(GreatLakesReader, start_date, end_date)
crawlerProcess.start()
crawlerProcess.join()
print('Data engine complete')

events = requests.get(config.db_get_events, params={
    'start_timestamp': 0,
    'end_timestamp': 10000000000
})
if len(events.json()) > 0:
    print('Data retrieved successfully')
else:
    print('No data retrieved')
'''
import logging

import gevent
from scrapy import signals
from scrapy.crawler import CrawlerProcess
from scrapy.utils.project import get_project_settings

from tddc import Singleton, TaskManager, Task, TaskCacheManager, TaskConfigModel, DBSession

from .Scrapy import SingleSpider

settings = get_project_settings()
crawler_process = CrawlerProcess(settings)
crawler_process.join()

log = logging.getLogger(__name__)


class Crawler(object):
    '''
    Spider management and task dispatch
    '''
    __metaclass__ = Singleton

    def __init__(self):
        '''
        Constructor
        '''
        log.info('Spider Is Starting.')
# settings.set('LOG_LEVEL', 'ERROR')
# pipeline configuration
settings.setdict({
    'ITEM_PIPELINES': {
        # 'shopper.pipelines.CsvPipeline': 300,  # export crawl results to a csv file
        'shopper.pipelines.JsonPipeline': 300,  # export crawl results to a json file
        # 'shopper.pipelines.CrawlNewPipeline': 300,  # DB import
    }
})

# Slack integration
settings.setdict(
    {'EXTENSIONS': {
        'shopper.middleware.slack_middleware.SlackStats': 100,
    }})
slack = SlackSum(settings.get("SLACK_TOKEN"), settings.get("SLACK_CHANNEL"),
                 settings.get("SLACK_BOT"))

process = CrawlerProcess(settings)
# process.crawl(ChanelSpider)
# process.crawl(YslSpider)
# process.crawl(LouisVuittonSpider)
# process.crawl(GucciSpider)
process.crawl(HermesSpider)
process.start()
process.join()

# send the crawl result data to Slack
# slack.total_finish()
import scrapy
from twisted.internet import reactor
from scrapy.crawler import CrawlerProcess
from scrapy.utils.log import configure_logging
from scrapy.utils.project import get_project_settings

configure_logging()
runner = CrawlerProcess(get_project_settings())
runner.crawl(AbcdinSpider)
runner.crawl(CasaximenaSpider)
runner.crawl(CoronaSpider)
runner.crawl(DafitiSpider)
runner.crawl(EasySpider)
runner.crawl(FalabellaSpider)
runner.crawl(HitesSpider)
runner.crawl(LapolarSpider)
runner.crawl(LinioSpider)
runner.crawl(ParisSpider)
runner.crawl(PcfactorySpider)
runner.crawl(RipleySpider)
runner.crawl(SodimacSpider)
runner.crawl(ZmartSpider)

d = runner.join()
d.addBoth(lambda _: reactor.stop())
reactor.run()  # the script will block here until all crawling jobs are finished
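# Note on the snippet above: runner.join() returns a Deferred that fires once every
# scheduled crawl has finished, which is why reactor.stop() is attached to it. When the
# reactor is driven manually like this, the Scrapy docs pair the pattern with
# CrawlerRunner rather than CrawlerProcess (whose start() installs and runs its own
# reactor). A minimal equivalent sketch, assuming a few of the spider classes above:
from twisted.internet import reactor
from scrapy.crawler import CrawlerRunner
from scrapy.utils.log import configure_logging
from scrapy.utils.project import get_project_settings

configure_logging()
runner = CrawlerRunner(get_project_settings())
for spider_cls in (AbcdinSpider, FalabellaSpider, ParisSpider, RipleySpider):
    runner.crawl(spider_cls)

d = runner.join()  # Deferred that fires when all crawls are done
d.addBoth(lambda _: reactor.stop())
reactor.run()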