def _crawl(path=None):
    """Run ``ProvinceSpider`` in a new ``CrawlerProcess`` with a fixed user agent.

    :param path: accepted by the signature but not used in the body.
    """
    process_settings = {
        'USER_AGENT': 'Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1)',
    }
    process = CrawlerProcess(process_settings)
    process.crawl(ProvinceSpider)
    # start() and stop() are issued back to back.
    process.start()
    process.stop()
def runSpiderProcess(spider_cls, *args, **kwargs):
    """
    Run a spider in a fresh ``CrawlerProcess`` and collect what it scraped.

    :param spider_cls: the spider class to run
    :param args: positional arguments forwarded to the spider
    :param kwargs: keyword arguments forwarded to the spider
    :return: a list holding one JSON-decoded object per scraped item
    """
    collected = []

    def _collect(item):
        # FIXME: items are round-tripped through JSON so that no scrapy item is handed back to celery;
        # FIXME: celery can't serialize them (rather opaque error) but it is fine with lists and dicts.
        encoded = scrapy_encoder.encode(item)
        collected.append(json.loads(encoded))

    runner = CrawlerProcess()
    runner.crawl(spider_cls, *args, **kwargs)
    # Hook the collector onto every crawler registered by the crawl() call.
    for active_crawler in runner.crawlers:
        active_crawler.signals.connect(_collect, item_scraped)

    runner.start()
    runner.stop()
    return collected
class MySpiderProcess1(scrapy.Spider):
    """Spider that can launch its own crawl inside a child process."""

    def __init__(self, name, urls):
        """Store the spider name and start URLs, then run the base initialiser.

        :param name: spider name
        :param urls: list of URLs assigned to ``start_urls``
        """
        # name/start_urls are assigned before the base initialiser, as before.
        self.name = name
        self.start_urls = urls
        # Only populated in the process that executes _crawl(); stays None in
        # a parent that used start().
        self.process = None
        scrapy.Spider.__init__(self)

    def parse(self, response):
        """Handle a downloaded response (currently only prints a marker)."""
        print('parse response')

    def _crawl(self):
        """Create a ``CrawlerProcess`` with the JSON pipeline and run the crawl."""
        settings = Settings()
        settings.set('ITEM_PIPELINES', {
            'app.pipelines.JsonWriterPipeline': 300
        })
        self.process = CrawlerProcess(settings)
        # crawl() expects a spider class (or Crawler / spider name), not a
        # spider instance; the constructor arguments are forwarded separately.
        self.process.crawl(type(self), self.name, self.start_urls)
        self.process.start()

    def start(self):
        """Run ``_crawl`` in a child process and wait for it to finish."""
        p = Process(target=self._crawl)
        p.start()
        p.join()

    def stop(self):
        """Stop the crawler process if one was created in this process."""
        # Previously raised AttributeError when no process had been created
        # here (e.g. in the parent after start()).
        if self.process is not None:
            self.process.stop()
def main():
    """Index alexa demographics.

    Queues one ``AlexaSpider`` crawl for every ``WebsitesContent`` link that
    has no matching ``Websites`` row, runs the crawls, and always closes the
    database session afterwards.
    """
    engine = db_connect()
    Session = sessionmaker(bind=engine)
    session = Session()
    # try/finally so the session is released even if setup or crawling fails.
    try:
        settings = get_project_settings()
        settings.set('ITEM_PIPELINES',
                     {'demographic_scraper.demographic_scraper.pipelines.WebsiteDemographicPipeline': 300})
        settings.set('EXTENSIONS', {'scrapy.telnet.TelnetConsole': None,})
        process = CrawlerProcess(settings)
        for website in session.query(WebsitesContent).all():
            demographic = list(session.query(Websites).filter_by(link=website.link))
            # Was ``len(demographic) is 0``: identity comparison against an int
            # literal is implementation-dependent; test emptiness instead.
            if not demographic:
                url = website.link
                print(website.link)
                AlexaSpider.name = url
                process.crawl(AlexaSpider, url=url, db_session=session)
        process.start()
        process.stop()
    finally:
        session.close()
class CrawlerScript():
    """Run a named spider in a child process and return the items it passed."""

    def __init__(self):
        self.crawler = CrawlerProcess(settings)
        if not hasattr(project, 'crawler'):
            self.crawler.install()
        self.crawler.configure()
        self.items = []
        dispatcher.connect(self._item_passed, signals.item_passed)

    def _item_passed(self, item):
        """Signal handler: remember every item reported by the crawler."""
        self.items.append(item)

    def _crawl(self, queue, spider_name):
        """Child-process body: run the spider and put the items on *queue*."""
        spider = self.crawler.spiders.create(spider_name)
        if spider:
            self.crawler.queue.append_spider(spider)
        self.crawler.start()
        self.crawler.stop()
        queue.put(self.items)

    def crawl(self, spider):
        """Crawl *spider* (a spider name) in a child process.

        :return: the list of items the child put on the queue
        """
        queue = Queue()
        p = Process(target=self._crawl, args=(queue, spider,))
        p.start()
        # Drain the queue BEFORE joining: a child that has put data on a queue
        # does not exit until that data is consumed, so join-then-get can
        # deadlock once the item list is large.
        items = queue.get(True)
        p.join()
        return items
def startSpiderTest(group_type, spider_type, spider_group_name, spider_name):
    """Build one spider from ``Spider_Dict`` and run it in a crawler process.

    The entry at ``Spider_Dict[group_type][spider_type]`` is read as a pair:
    index 0 is called to build the spider, index 1 is copied and used as its
    keyword arguments.
    """
    # Load the Scrapy project settings and create the crawler process.
    process = CrawlerProcess(get_project_settings())

    # Create the crawler; one crawler process can run several crawls.
    crawler = process.create_crawler(spider_name)

    # Hook the spider state callbacks: each is invoked when the crawler
    # emits the corresponding signal.
    connect = crawler.signals.connect
    connect(spiderSignal.startSingnal, signals.spider_opened)
    connect(spiderSignal.errorSingnal, signals.spider_error)
    connect(spiderSignal.stopSingnal, signals.spider_closed)

    # Look up the spider class and its default arguments.
    conf = Spider_Dict[group_type][spider_type]
    spider_kwargs = conf[1].copy()
    spider_kwargs.update(
        name=spider_name,
        redis_key=spider_name,
        spider_type=spider_type,
        spider_group_name=spider_group_name,
        task_id="-1",
    )
    spider = conf[0](**spider_kwargs)

    # Attach the spider to the crawler.
    crawler.configure()
    crawler.crawl(spider)

    # Run the crawl.
    process.start()
    process.stop()
class CrawlerScript():
    """Run ``BingSpider`` in a child process and return the scraped items."""

    def __init__(self):
        settings = get_project_settings()
        settings.set('LOG_ENABLED', False, priority='cmdline')
        self.crawler = CrawlerProcess(settings)
        self.items = []
        SignalManager(dispatcher.Any).connect(self._item_passed, signal=signals.item_scraped)

    def _item_passed(self, item, response, spider):
        """Signal handler: remember every scraped item."""
        self.items.append(item)

    def _crawl(self, q, queue):
        """Child-process body: crawl for query *q* and put the items on *queue*."""
        self.crawler.crawl(BingSpider, q=q)
        self.crawler.start()
        self.crawler.stop()
        queue.put(self.items)

    def crawl(self, q):
        """Crawl for query *q* in a child process.

        :return: the list of items the child put on the queue
        """
        queue = Queue()
        p = Process(target=self._crawl, args=[q, queue])
        p.start()
        # Read the result before joining: joining a child that still has
        # unconsumed data on a queue can deadlock.
        items = queue.get(True)
        p.join()
        return items
def main():
    """Crawl the policy page behind every rule, then commit each repository."""
    repos = read_yaml(CONFIG_PATH)  # FIXME: obtain last commit?
    rules = read_yaml(RULES_PATH)  # FIXME: obtain last commit?

    from scrapy.crawler import CrawlerProcess
    from scrapy.utils.project import get_project_settings

    crawler_process = CrawlerProcess(get_project_settings())
    # Queue one 'policies' crawl per rule.
    for rule in rules:
        crawler_process.crawl(
            'policies',
            policies_path=create_policy_file_path(rule),
            url=rule['url'],
            xpath=rule['xpath'],
        )

    logger.debug('starting crawler')
    # the script will block here until the crawling is finished
    crawler_process.start()
    crawler_process.stop()

    for repo in repos:
        commit(DATA_REPO_PATH, repo['url'], repo['name'])
def scrapeando():
    """Scrape la Casa del Libro and insert every passed item into the ``book`` table."""
    from scrapy import signals
    from scrapy.xlib.pydispatch import dispatcher

    def catch_item(sender, item, **kwargs):
        """Populate the DB."""
        # NOTE(review): the statement is assembled by string concatenation from
        # scraped values; this is only safe if decodifica() quotes/escapes its
        # input - confirm, or switch to a parameterized query.
        for x, _field in enumerate(item.items()):
            values = [
                decodifica(item['Nombre'][x]),
                decodifica(item['Autor'][x]),
                decodifica(item['Editorial'][x]),
                decodifica(item['Fecha'][x]),
                decodifica(item['Precio'][x]),
                decodifica("http://www.casadellibro.com" + item['Link'][x]),
            ]
            query = ("INSERT INTO book (Nombre ,Autor, Editorial ,Fecha, Precio, Link) VALUES ("
                     + ",".join(values) + ");")
            db.micursor.execute(query)
            db.conexion.commit()
        print(item)

    dispatcher.connect(catch_item, signal=signals.item_passed)

    from scrapy.conf import settings
    settings.overrides['LOG_ENABLED'] = False

    from scrapy.crawler import CrawlerProcess
    crawler = CrawlerProcess(settings)
    crawler.install()
    crawler.configure()

    # Build the spider and hand it the current search term.
    book = BookSpider()
    book.busqueda = unicode(search.getbusqueda())
    crawler.crawl(book)

    print("Start scraping to la Casa del Libro")
    crawler.start()
    print("End scraping to la Casa del Libro")
    crawler.stop()
class CrawlerWorker(multiprocessing.Process):
    """Process that crawls one spider and puts the passed items on a queue."""

    def __init__(self, spider, result_queue):
        super(CrawlerWorker, self).__init__()
        self.spider = spider
        self.result_queue = result_queue
        self.items = []

        self.crawler = CrawlerProcess(settings)
        # Install the crawler only when the project has none yet.
        if not hasattr(project, 'crawler'):
            self.crawler.install()
        self.crawler.configure()

        dispatcher.connect(self._item_passed, signals.item_passed)

    def _item_passed(self, item):
        """Signal handler: remember every passed item."""
        self.items.append(item)

    def run(self):
        """Crawl the spider, then publish the collected items."""
        crawler = self.crawler
        crawler.crawl(self.spider)
        crawler.start()
        crawler.stop()
        self.result_queue.put(self.items)
def _crawl(path):
    """Run ``ProvinceSpider`` with INFO-level logging sent to ``text.log``.

    :param path: accepted by the signature but not used in the body.
    """
    from scrapy.crawler import CrawlerProcess

    crawler_settings = {
        'USER_AGENT': 'Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1)',
        'LOG_FILE': 'text.log',
        'LOG_LEVEL': 'INFO',
    }
    runner = CrawlerProcess(crawler_settings)
    runner.crawl(ProvinceSpider)
    runner.start()
    runner.stop()
def main():
    """Refresh the data repositories, crawl every rule, then commit and push.

    In order: load the repository and rule configuration, write the SSH
    material used for git, pull or clone each data repository, queue one
    ``policies`` crawl per rule, run the crawls, commit/push changes and
    finally leave through ``sys.exit()``.
    """
    repos_conf = obtain_yaml(CONFIG_REPO_PATH, CONFIG_REPO_URL,
                             CONFIG_REPO_BRANCH, CONFIG_PATH)
    rules = obtain_yaml(RULES_REPO_PATH, RULES_REPO_URL, RULES_REPO_BRANCH)

    # SSH keys, git ssh wrapper command and known server key.
    write_ssh_keys(SSH_DIR, MORPH_SSH_PRIV_KEY_ENV, MORPH_SSH_PUB_KEY_ENV,
                   SSH_PRIV_KEY_PATH, SSH_PUB_KEY_PATH)
    ssh_command = GIT_SSH_COMMAND_MORPHIO if ismorpio() else GIT_SSH_COMMAND
    write_ssh_command(GIT_SSH_COMMAND_PATH, ssh_command)
    write_ssh_key_server(GITHUB_SSH_PUB_KEY, SSH_PUB_KEY_SERVER_PATH)

    repos = []
    for repo_conf in repos_conf:
        repo_name = repo_conf.get('name')
        logger.debug('repo name %s' % repo_name)
        repos.append(
            pull_or_clone(DATA_REPO_PATH, repo_conf.get('url'),
                          DATA_REPO_BRANCH, repo_name,
                          GIT_SSH_COMMAND_PATH, False))

    from scrapy.crawler import CrawlerProcess
    from scrapy.utils.project import get_project_settings

    crawler_process = CrawlerProcess(get_project_settings())
    for rule in rules:
        crawler_process.crawl(
            'policies',
            policies_path=create_data_file_path(rule, DATA_REPO_PATH),
            url=rule['url'],
            xpath=rule['xpath'],
        )

    logger.debug('starting crawler')
    # the script will block here until the crawling is finished
    crawler_process.start()
    crawler_process.stop()

    for repo in repos:
        commit_push_if_changes(repo, GIT_AUTHOR_NAME, GIT_AUTHOR_EMAIL,
                               GIT_SSH_COMMAND_PATH, DATA_REPO_BRANCH,
                               METADATA_PATH)
    sys.exit()
def run_water_spider(startDate, endDate, **kwargs):
    """Run ``WaterSpider`` for station ``8735180`` over the given date range.

    :param startDate: first date, passed to the spider unchanged
    :param endDate: last date, passed to the spider unchanged
    :param kwargs: accepted for interface compatibility; not used

    Any exception raised while running or cleaning up is printed, not raised.
    """
    water_crawlerProcess = CrawlerProcess(settings)
    water_crawlerProcess.install()
    water_crawlerProcess.configure()
    # Fixed: the range used to be read from sys.argv[1:3], silently ignoring
    # the function's own arguments (runscrapy() already uses its parameters).
    spider = WaterSpider("8735180", startDate, endDate)
    water_crawlerProcess.crawl(spider)
    try:
        try:
            water_crawlerProcess.start()
            water_crawlerProcess.stop()
        finally:
            # Always undo install(), even when the crawl itself failed.
            water_crawlerProcess.uninstall()
    except Exception as e:
        print(e)
def run_wind_spider(startDate, endDate, **kwargs):
    """Run ``WindSpider`` for station ``dpia1`` over the given date range.

    :param startDate: first date, passed to the spider unchanged
    :param endDate: last date, passed to the spider unchanged
    :param kwargs: accepted for interface compatibility; not used

    Any exception raised while running or cleaning up is printed, not raised.
    """
    wind_crawlerProcess = CrawlerProcess(settings)
    wind_crawlerProcess.install()
    wind_crawlerProcess.configure()
    # Fixed: the range used to be read from sys.argv[1:3], silently ignoring
    # the function's own arguments (runscrapy() already uses its parameters).
    spider2 = WindSpider("dpia1", startDate, endDate)
    wind_crawlerProcess.crawl(spider2)
    try:
        try:
            wind_crawlerProcess.start()
            wind_crawlerProcess.stop()
        finally:
            # Always undo install(), even when the crawl itself failed.
            wind_crawlerProcess.uninstall()
    except Exception as e:
        print(e)
def runscrapy(stationID, startDate, endDate, **kwargs):
    """Run ``Spider`` for one station and date range in a new crawler process.

    :param stationID: station identifier passed to the spider
    :param startDate: first date passed to the spider
    :param endDate: last date passed to the spider
    :param kwargs: accepted for interface compatibility; not used

    Any exception raised while running or cleaning up is printed, not raised.
    """
    crawlerProcess = CrawlerProcess(settings)
    crawlerProcess.install()
    crawlerProcess.configure()
    spider = Spider(stationID, startDate, endDate)
    crawlerProcess.crawl(spider)
    try:
        try:
            crawlerProcess.start()
            crawlerProcess.stop()
        finally:
            # Previously skipped when start()/stop() raised, leaving the
            # crawler installed.
            crawlerProcess.uninstall()
    except Exception as e:
        # print() call form works on both Python 2 and Python 3.
        print(e)
def start_my_crawl(builder):
    """Run ``MyCrawlSpider`` with only the database pipeline enabled.

    :param builder: forwarded to the spider as the ``builder`` keyword.
    """
    pipelines = {
        'app.pipelines.DataBasePipeline': 300,
    }
    crawl_settings = Settings()
    crawl_settings.set("ITEM_PIPELINES", pipelines)

    crawl_process = CrawlerProcess(crawl_settings)
    crawl_process.crawl(MyCrawlSpider, builder=builder)
    crawl_process.start()
    print('process.stop()')
    crawl_process.stop()
def _runCrawler(spider, results):
    """Crawl *spider* and put the list of scraped items on *results*.

    Settings are loaded from the ``Extractors.HTMLScraper.settings`` module.
    """
    scraped = []

    def _item_passed(item, response, spider):
        # Signal handler; kept as a named local so it stays referenced for
        # the whole crawl.
        scraped.append(item)

    module = importlib.import_module('Extractors.HTMLScraper.settings')
    process = CrawlerProcess(CrawlerSettings(module))
    dispatcher.connect(_item_passed, signals.item_scraped)

    current = process.create_crawler("currentCrawler")
    current.crawl(spider)
    process.start()
    process.stop()
    results.put(scraped)
class CrawlerWorker(Process):
    """Process that runs one spider and puts the scraped items on ``results``."""

    def __init__(self, spider, results, **kwargs):
        super(CrawlerWorker, self).__init__()
        self.spider = spider
        self.kwargs = kwargs
        self.results = results
        self.items = []
        self.crawler = CrawlerProcess(get_project_settings())
        dispatcher.connect(self._item_scraped, signals.item_scraped)

    def _item_scraped(self, item, spider):
        """Signal handler: remember every scraped item."""
        self.items.append(item)

    def run(self):
        """Crawl the spider with the stored keyword arguments, then publish the items."""
        runner = self.crawler
        runner.crawl(self.spider, **self.kwargs)
        runner.start()
        runner.stop()
        self.results.put(self.items)
class CrawlerWorker(Process):
    """Process that crawls ``spider`` for ``query`` and publishes the passed items."""

    def __init__(self, spider, query, results):
        super(CrawlerWorker, self).__init__()
        self.spider = spider
        self.query = query
        self.results = results
        self.items = []
        dispatcher.connect(self._item_passed, signals.item_passed)

    def _item_passed(self, item):
        """Signal handler: remember every passed item."""
        self.items.append(item)

    def run(self):
        """Create the crawler process with default ``Settings`` and run the crawl."""
        self.crawler = CrawlerProcess(Settings())
        runner = self.crawler
        runner.crawl(self.spider, self.query)
        runner.start()
        runner.stop()
        self.results.put(self.items)
class CrawlerScript():
    """Crawl one spider and put the passed items on ``results``."""

    def __init__(self, spider, results):
        self.spider = spider
        self.results = results
        self.items = []

        self.crawler = CrawlerProcess(settings)
        # Install the crawler only when the project has none yet.
        if not hasattr(project, 'crawler'):
            self.crawler.install()
        self.crawler.configure()

        dispatcher.connect(self._item_passed, signals.item_passed)

    def _item_passed(self, item):
        """Signal handler: remember every passed item."""
        self.items.append(item)

    def run(self):
        """Crawl the spider, then publish the collected items."""
        runner = self.crawler
        runner.crawl(self.spider)
        runner.start()
        runner.stop()
        self.results.put(self.items)
class CrawlerScript():
    """Run ``CraigslistSpider`` in a child process and return its object list."""

    def __init__(self):
        self.crawler = CrawlerProcess(Settings())
        self.crawler.install()
        self.crawler.configure()

    def _crawl(self, queue, search):
        """Child-process body: crawl and put the spider's object list on *queue*.

        :param queue: queue that receives ``current_spider.get_object_list()``
        :param search: search URL; applied only when truthy
        """
        log.start(loglevel=log.DEBUG)
        current_spider = CraigslistSpider()
        if search:
            current_spider.set_search_url(search)
        self.crawler.crawl(current_spider)
        self.crawler.start()
        self.crawler.stop()
        queue.put(current_spider.get_object_list())

    def crawl(self, search=""):
        """Crawl in a child process and return what the child put on the queue."""
        q = Queue()
        p = Process(target=self._crawl, args=(q, search))
        p.start()
        # Consume the result before joining: a child with unread data on a
        # queue may never terminate, so join-then-get can deadlock.
        result = q.get()
        p.join()
        return result
class RunCrawler():
    """Run a crawler inside a separate process.

    Background reading:
    https://groups.google.com/forum/?fromgroups#!topic/scrapy-users/8zL8W3SdQBo
    http://stackoverflow.com/questions/13437402/how-to-run-scrapy-from-within-a-python-script
    """

    def __init__(self, settings):
        crawler = CrawlerProcess(settings)
        crawler.configure()
        self.crawler = crawler

    def _crawl(self, spider):
        """Child-process body: crawl *spider* with the configured crawler."""
        runner = self.crawler
        runner.crawl(spider)
        runner.start()
        runner.stop()

    def crawl(self, spider):
        """Crawl *spider* in a child process and wait for it to exit."""
        worker = Process(target=self._crawl, args=(spider,))
        worker.start()
        worker.join()
class CrawlerWorker():
    """Run ``IrrSpider`` with the MongoDB pipeline and collect the passed items."""

    def __init__(self):
        crawl_settings = Settings({
            'ITEM_PIPELINES': {
                'pipelines.MongoDBPipeline': 1
            },
            'MONGO_URI': 'mongodb://localhost:27017/',
            'MONGO_DATABASE': 'realto'
        })
        self.process = CrawlerProcess(crawl_settings)
        self.items = []
        dispatcher.connect(self.item_passed, signals.item_passed)

    def item_passed(self, item):
        """Signal handler: remember every passed item."""
        self.items.append(item)

    def run(self, config):
        """Crawl with *config* as the spider's keyword arguments.

        :return: the list of collected items
        """
        runner = self.process
        runner.crawl(IrrSpider, **config)
        runner.start()
        runner.stop()
        return self.items
class CrawlerScript():
    """Run a named spider in a child process and return the collected items."""

    def __init__(self):
        self.crawler = CrawlerProcess(settings)
        # Fixed: ``items`` was never initialised, so _crawl() raised
        # AttributeError in the child and crawl() waited on the queue forever.
        self.items = []
        # NOTE(review): nothing in this class connects _item_passed to a
        # signal, so the list stays empty unless a caller wires it up - confirm.

    def _item_passed(self, item):
        """Remember *item*; intended as an item signal handler."""
        self.items.append(item)

    def _crawl(self, queue, spider_name):
        """Child-process body: run the spider and put the items on *queue*."""
        spider = self.crawler.spiders.create(spider_name)
        if spider:
            self.crawler.queue.append_spider(spider)
        self.crawler.start()
        self.crawler.stop()
        queue.put(self.items)

    def crawl(self, spider):
        """Crawl *spider* (a spider name) in a child process.

        :return: the list of items the child put on the queue
        """
        queue = Queue()
        p = Process(target=self._crawl, args=(queue, spider,))
        p.start()
        # Drain the queue before joining; join-then-get can deadlock when the
        # child still has unconsumed data on the queue.
        items = queue.get(True)
        p.join()
        return items
class CrawlerWorker(Process):
    """Process that crawls one spider using the HTMLScraper settings module."""

    def __init__(self, spider, results):
        super(CrawlerWorker, self).__init__()
        self.results = results
        module = importlib.import_module('Extractors.HTMLScraper.settings')
        self.crawlerProcess = CrawlerProcess(CrawlerSettings(module))
        self.items = []
        self.spider = spider
        dispatcher.connect(self._item_passed, signals.item_passed)

    def _item_passed(self, item):
        """Signal handler: remember every passed item."""
        self.items.append(item)

    def run(self):
        """Create the crawler, run the spider and publish the collected items."""
        process = self.crawlerProcess
        current = process.create_crawler("currentCrawler")
        current.crawl(self.spider)
        process.start()
        process.stop()
        self.results.put(self.items)
class DomainCrawlerScript():
    """Crawl every page URL of a ``Domain`` inside a child process."""

    def __init__(self):
        crawler = CrawlerProcess(settings)
        crawler.install()
        crawler.configure()
        self.crawler = crawler

    def _crawl(self, domain_pk):
        """Child-process body: load the domain and crawl the URLs of its pages."""
        domain = Domain.objects.get(pk=domain_pk)
        page_urls = [page.url() for page in domain.pages.all()]
        runner = self.crawler
        runner.crawl(DomainSpider(page_urls))
        runner.start()
        runner.stop()

    def crawl(self, domain_pk):
        """Crawl the domain with primary key *domain_pk* and wait for completion."""
        worker = Process(target=self._crawl, args=[domain_pk])
        worker.start()
        worker.join()
class AccountCrawler():
    """Crawl one user account of an origin OJ inside a child process."""

    def __init__(self):
        self.crawler = CrawlerProcess(settings)

    def _crawl(self, origin_oj, username):
        """Child-process body: run the spider named ``<origin_oj>_user``."""
        spider_name = origin_oj + '_user'
        runner = self.crawler
        runner.crawl(spider_name, username=username)
        runner.start()
        runner.stop()

    def crawl(self, origin_oj, username):
        """Crawl *username* on *origin_oj* and wait for the child to exit."""
        worker = Process(target=self._crawl, args=[origin_oj, username])
        worker.start()
        worker.join()
class CrawlerWorker(Process):
    """Process that runs a pre-built spider and publishes the passed items."""

    def __init__(self, spider, result_list, settings=None):
        """
        :param spider: spider instance to run
        :param result_list: object whose ``put()`` receives the item list
        :param settings: crawler settings; a default ``Settings()`` when None
        """
        Process.__init__(self)
        self.result_queue = result_list
        if settings is None:
            settings = Settings()
        self.crawler = CrawlerProcess(settings)
        self.crawler.create_crawler(spider.__class__.__name__)
        self.crawler.crawlers['spider'] = spider
        self.spider = spider
        self.items = []
        dispatcher.connect(self._item_passed, signals.item_passed)

    def _item_passed(self, item):
        """Signal handler: remember *item* and print a trace marker."""
        self.items.append(item)
        # Was the Python 2 statement ``print "here"`` (a SyntaxError on
        # Python 3); the call form prints the same text on both.
        print("here")

    def run(self):
        """Run the crawl, then put the collected items on the result queue."""
        self.crawler.start()
        self.crawler.stop()
        self.result_queue.put(self.items)
class CrawlerWorker(multiprocessing.Process):
    """Process that crawls a ``RecipeSpider`` for one URL and publishes the items."""

    def __init__(self, result_queue, url):
        super(CrawlerWorker, self).__init__()
        self.result_queue = result_queue
        self.url = url
        # NOTE(review): ``Settings`` is passed as-is, not called; if it is
        # the scrapy Settings class this probably wants ``Settings()`` - confirm.
        self.crawler = CrawlerProcess(Settings)
        self.crawler.configure()
        self.items = []
        self.spider = RecipeSpider(url)
        dispatcher.connect(self._item_passed, signals.item_passed)

    def _item_passed(self, item):
        """Signal handler: remember every passed item."""
        self.items.append(item)

    def run(self):
        """Crawl the spider, then publish the collected items."""
        runner = self.crawler
        runner.crawl(self.spider)
        runner.start()
        runner.stop()
        self.result_queue.put(self.items)
def _run_crawl_process(**kwargs):
    """Run the spider named by ``kwargs['spider']`` in-process.

    All keyword arguments (including ``spider``) are forwarded to the spider
    factory. The log level comes from ``django_settings.SCRAPY_LOG_LEVEL`` and
    defaults to ``'INFO'``.

    Examples on how to run a crawler in-process:
    http://stackoverflow.com/questions/14777910/scrapy-crawl-from-script-always-blocks-script-execution-after-scraping?lq=1
    http://stackoverflow.com/questions/13437402/how-to-run-scrapy-from-within-a-python-script
    http://stackoverflow.com/questions/7993680/running-scrapy-tasks-in-python
    http://stackoverflow.com/questions/15564844/locally-run-all-of-the-spiders-in-scrapy
    https://groups.google.com/forum/#!topic/scrapy-users/d4axj6nPVDw
    """
    # log.start must be explicitly called
    level = getattr(django_settings, 'SCRAPY_LOG_LEVEL', 'INFO')
    log.start(loglevel=level)

    process = CrawlerProcess(settings)
    process.install()
    process.configure()

    spider_name = kwargs['spider']
    process.crawl(process.spiders.create(spider_name, **kwargs))

    log.msg('Spider started...')
    process.start()
    log.msg('Spider stopped.')
    process.stop()
class Processor(Process):
    """Run the provided scrapy spiders in a child process.

    ``run()`` blocks until the child has reported its results.
    """

    def __init__(self, settings=None):
        """
        Parms:
          settings (scrapy.settings.Settings) - settings to apply. A new
          ``Settings()`` is used when none (or a falsy value) is given.
        """
        self.settings = settings or Settings()
        self.items = []
        self.results = Queue(ctx=__import__("billiard.synchronize"))
        dispatcher.connect(self._item_scraped, signals.item_scraped)

    def _item_scraped(self, item):
        """Signal handler: remember every scraped item."""
        self.items.append(item)

    def _crawl(self, requests):
        """
        Parameters:
            requests (Request) - One or more Jobs. All are queued on a single
            CrawlerProcess before it is started.
        """
        self.crawler = CrawlerProcess(self.settings)
        runner = self.crawler
        # crawl can be called multiple times to queue several requests
        for job in requests:
            runner.crawl(job.spider, *job.args, **job.kwargs)
        runner.start()
        runner.stop()
        self.results.put(self.items)

    def run(self, jobs):
        """Execute all jobs in a child process and return the collected items.

        Parms:
          jobs ([Job]) - one or more Job objects to be processed; a single
          non-iterable value is wrapped in a list.

        Returns:
          The item list the child process put on the results queue.
        """
        if not isinstance(jobs, collections.abc.Iterable):
            jobs = [jobs]
        self.validate(jobs)

        worker = Process(target=self._crawl, args=[jobs])
        worker.start()
        # The queue is read before joining the child.
        collected = self.results.get()
        worker.join()
        worker.terminate()
        return collected

    def validate(self, jobs):
        """Raise ``ScrapyScriptException`` unless every element is a ``Job``."""
        if not all(isinstance(candidate, Job) for candidate in jobs):
            raise ScrapyScriptException("scrapyscript requires Job objects.")
def f(self, urls, _video=None):
    """Crawl *urls* with ``Moocspider`` using the project settings.

    :param urls: forwarded to the spider as ``urls``
    :param _video: forwarded to the spider as ``video``
    """
    crawl_process = CrawlerProcess(get_project_settings())
    crawl_process.crawl(Moocspider, urls=urls, video=_video)
    crawl_process.start(stop_after_crawl=True)
    crawl_process.stop()
class CrawlerProcessScript(object):
    """
    Creates multiple crawlers and runs them through one crawler process.

    Crawler names follow this naming convention:
        spider_name + _ + city + _ + category

    crawlers : keeps track of all crawlers run, so their stats can be read
               after they are finished.
    """

    def __init__(self, dsite_name='', updating=False):
        self.updating = str(updating)
        self.dsite = DSite.objects.get(name=dsite_name)
        self.crawler_process = CrawlerProcess(get_project_settings())
        self.crawlers = {}

    def _add_crawler(self, crawler_name, city_mapping_pk=None,
                     category_mapping_pk=None):
        """Create a crawler plus its spider and register it under *crawler_name*."""
        crawler = self.crawler_process.create_crawler(crawler_name)
        spider = crawler.spiders.create(
            self.dsite.name,
            dsite_pk=self.dsite.pk,
            city_mapping_pk=city_mapping_pk,
            category_mapping_pk=category_mapping_pk,
            updating=self.updating)
        crawler.crawl(spider)
        self.crawlers[crawler_name] = crawler

    def _add_all_cities_category_crawlers(self):
        """Add one crawler per category mapping flagged ``all_cities=True``."""
        for category_mapping in CategoryMapping.objects.filter(
                dsite=self.dsite, all_cities=True):
            crawler_name = self.dsite.name + '_' + category_mapping.site_category
            self._add_crawler(crawler_name=crawler_name,
                              category_mapping_pk=category_mapping.pk)

    def _create_crawlers(self):
        """Create the crawlers that match the site's mapping configuration."""
        if self.dsite.has_both_mappings:
            # One crawler per (city, city-specific category) pair.
            for city_mapping in CityMapping.objects.filter(dsite=self.dsite):
                for category_mapping in CategoryMapping.objects.filter(
                        dsite=self.dsite, all_cities=False):
                    crawler_name = (self.dsite.name + '_' +
                                    city_mapping.site_city + '_' +
                                    category_mapping.site_category)
                    self._add_crawler(crawler_name=crawler_name,
                                      category_mapping_pk=category_mapping.pk,
                                      city_mapping_pk=city_mapping.pk)
            if self.dsite.has_category_mapping:
                self._add_all_cities_category_crawlers()
        elif self.dsite.has_city_mapping:
            for city_mapping in CityMapping.objects.filter(dsite=self.dsite):
                crawler_name = self.dsite.name + '_' + city_mapping.site_city
                self._add_crawler(crawler_name=crawler_name,
                                  city_mapping_pk=city_mapping.pk)
            if self.dsite.has_category_mapping:
                self._add_all_cities_category_crawlers()
        elif self.dsite.has_category_mapping:
            for category_mapping in CategoryMapping.objects.filter(
                    dsite=self.dsite):
                crawler_name = self.dsite.name + '_' + category_mapping.site_category
                self._add_crawler(crawler_name=crawler_name,
                                  category_mapping_pk=category_mapping.pk)

    def start(self):
        """Create all crawlers, run them, then stop the process and the reactor."""
        self._create_crawlers()
        self.crawler_process.start()
        self.crawler_process.stop()
        self.crawler_process.stop_reactor()

    def dump_stats(self):
        """Print each crawler's name followed by its collected stats."""
        # items() and print() work on both Python 2 and 3; iteritems() and
        # the print statement were Python 2 only.
        for crawler_name, crawler in self.crawlers.items():
            print(crawler_name)
            print(crawler.stats.get_stats())
def run_config(config):
    """Crawl the documentation site described by *config* and update Algolia.

    After the crawl the browser is destroyed, extra records and synonyms are
    pushed when present, and the temporary index is committed only if at least
    one record was indexed; otherwise a crawling issue is reported.
    """
    config = ConfigLoader(config)
    CustomMiddleware.driver = config.driver
    DocumentationSpider.NB_INDEXED = 0

    if config.use_anchors:
        from . import scrapy_patch

    strategy = DefaultStrategy(config)
    algolia_settings = AlgoliaSettings.get(config, strategy.levels)
    algolia_helper = AlgoliaHelper(config.app_id, config.api_key,
                                   config.index_name, algolia_settings)

    # Dotted paths differ depending on whether this module runs as a script.
    if __name__ == '__main__':
        middleware_path = 'src.custom_middleware.CustomMiddleware'
        context_factory_path = 'src.scrapy_patch.CustomContextFactory'
    else:
        middleware_path = 'scraper.src.custom_middleware.CustomMiddleware'
        context_factory_path = 'scraper.src.scrapy_patch.CustomContextFactory'

    crawler_settings = {
        'LOG_ENABLED': '1',
        'LOG_LEVEL': 'ERROR',
        'USER_AGENT': config.user_agent,
        # Need to be > 600 to be after the redirectMiddleware
        'DOWNLOADER_MIDDLEWARES': {middleware_path: 900},
        'DOWNLOADER_CLIENTCONTEXTFACTORY': context_factory_path,
    }
    process = CrawlerProcess(crawler_settings)
    process.crawl(DocumentationSpider,
                  config=config,
                  algolia_helper=algolia_helper,
                  strategy=strategy)
    process.start()
    process.stop()

    # Kill browser if needed
    BrowserHandler.destroy(config.driver)

    if len(config.extra_records) > 0:
        algolia_helper.add_records(config.extra_records, "Extra records")
    if len(Camelizer.synonyms) > 0:
        algolia_helper.add_synonyms(Camelizer.synonyms)

    print("")
    if DocumentationSpider.NB_INDEXED > 0:
        algolia_helper.commit_tmp_index()
        print('Nb hits: ' + str(DocumentationSpider.NB_INDEXED))
        config.update_nb_hits(DocumentationSpider.NB_INDEXED)
    else:
        print('Crawling issue: nbHits 0 for ' + config.index_name)
        algolia_helper.report_crawling_issue()
    print("")
def _connect_to_websites_db():
    """Open a connection to the ``websites`` database; exit the program if it cannot be reached."""
    try:
        return mysql.connector.connect(
            host="localhost",
            user=username,
            password=password,
            database='websites',
            auth_plugin='mysql_native_password')
    except mysql.connector.InterfaceError:
        print("""The connection to the database has been unsuccessful. Please make sure the SQL server is running, and the database has been initialised. To initialise database please type \"CREATE DATABASE websites;\" in a suitable SQL terminal and make sure the admin username and password have been entered into credentials.txt""")
        sys.exit()


def _prompt_for_jumps():
    """Ask on stdin until a positive whole number is entered; return it as an int."""
    while True:
        answer = input(
            "Please input the max number of jumps to be performed by the scraper \n")
        try:
            value = int(answer)
        except ValueError:
            print("Please input a valid positive integer.")
            continue
        if value > 0:
            return value
        print("Please input a valid positive integer.")


def _resolve_hyperlink(item, base_url):
    """Return the absolute URL to store for *item*, or None when the link is skipped."""
    if item == "":
        return None
    if len(item) > 1 or "http" in item:
        # Drop a trailing slash so equivalent links compare equal.
        if item.endswith("/"):
            item = item[:-1]
        if "http" in item:
            return item
        if item[0] == "/":
            # Relative link: prefix the site's base URL.
            return base_url + item
        if item != '#':
            # Relative link without a leading slash.
            return base_url + '/' + item
        return None
    if item == "/":
        # A bare "/" points at the home page.
        return base_url
    # Any other single character (e.g. a lone '#') is ignored.
    return None


def runScrape(page="", jumps=0):
    """Scrape a website and store its hyperlinks in a per-domain MySQL table.

    :param page: URL to scrape; when empty the user is prompted for one
    :param jumps: crawl depth limit; when not positive the user is prompted
    :return: True once all links have been written
    :raises SystemExit: when the database cannot be reached

    The table named after the domain is dropped and recreated on every run.
    Links are read from the module-level ``dictOfUrl`` after the crawl.
    """
    # Fail fast: make sure the database is reachable before crawling.
    _connect_to_websites_db().close()

    if jumps <= 0:
        # Fixed: the validated value is now kept as an int instead of the raw
        # input string.
        jumps = _prompt_for_jumps()

    if page == "":
        website = input("Please input the website you wish to scrape: ")
    else:
        website = page

    # Trim the protocol and everything after the first "/" to get the domain.
    domain = website.replace("https://", "").replace("http://", "").split("/", 1)[0]

    # region scraper start
    ScraperWithLimit.allowed_domains = [domain]
    ScraperWithLimit.start_urls = [website]
    ScraperWithLimit.custom_settings = {
        'DEPTH_LIMIT': jumps,
        'DEPTH_PRIORITY': 0,
    }
    process = CrawlerProcess()
    process.crawl(ScraperWithLimit)
    process.start()
    process.stop()
    # endregion

    # Values with the domain prepended must carry the site's protocol too.
    base_url = domain
    if "https://" in website:
        base_url = "https://" + domain
    elif "http://" in website:
        base_url = "http://" + domain

    # Backticks quote the table name so characters such as "." are accepted;
    # embedded backticks are doubled so the name cannot break out of the quoting.
    table = domain.replace("`", "``")
    # The %s placeholders are filled in by the connector, not by string formatting.
    query = "INSERT INTO `" + table + "` VALUES (NULL, %s, %s);"

    mydb = _connect_to_websites_db()
    # try/finally: the cursor and connection used to leak when a statement failed.
    try:
        mycursor = mydb.cursor()
        try:
            # region database setup
            mycursor.execute("DROP TABLE IF EXISTS `%s`;" % table)
            time.sleep(.25)  # pause retained from the original implementation
            mycursor.execute(
                "CREATE TABLE `%s`(AutoID INT NOT NULL AUTO_INCREMENT PRIMARY KEY, OriginURL VARCHAR(2000) NOT NULL, Hyperlink VARCHAR(2000) NOT NULL);"
                % table)
            time.sleep(.25)  # pause retained from the original implementation
            mydb.commit()
            # endregion

            print("Writing to database. This may take a while.")
            for originURL, hyperlinks in dictOfUrl.items():
                # endswith() also copes with an empty origin URL, where [-1]
                # raised IndexError.
                if originURL.endswith("/"):
                    originURL = originURL[:-1]
                for item in hyperlinks:
                    target = _resolve_hyperlink(item, base_url)
                    if target is not None:
                        mycursor.execute(query, (originURL, target))
                        mydb.commit()
        finally:
            mycursor.close()
    finally:
        mydb.close()
    # Returned so callers have confirmation that all writes have finished.
    return True
class VineyardSpider:
    """Collect vineyard data from http://www.biodynamicfood.org/ into a CSV file.

    Call order: ``load_vineyards`` -> ``get_vineyard_links`` ->
    ``prepare_vineyard_parsing`` -> ``parse_organizations`` -> ``close_parser``.
    """

    # Header row; the order matches the rows written while parsing.
    _CSV_HEADER = [
        'Name', 'Date', 'Category', 'Address', 'Phone', 'Email', 'Website',
        'Short description', 'Description', 'Crops', 'Processed products',
        'Cropped_acreage', 'Total_acreage'
    ]

    def __init__(self, driver: webdriver, destination: str, log=True):
        """Start an instance of the VineyardSpider, which can then be used to
        download data from http://www.biodynamicfood.org/

        :param driver: browser driver used to load the listing page
        :param destination: path of the CSV file to create or append to
        :param log: print progress messages when true
        """
        self.driver = driver
        self.destination = destination
        self.log = log
        self._create_csv()
        self.time = None
        # Set by prepare_vineyard_parsing().
        self.process = None

    def _create_csv(self):
        """Create the output CSV with its header row unless the file already exists."""
        if os.path.isfile(self.destination):
            return
        # newline='' as required by the csv module (and as already used when
        # appending rows); without it extra blank lines appear on Windows.
        with open(self.destination, 'w', newline='') as output:
            csv.writer(output).writerow(self._CSV_HEADER)

    def load_vineyards(self, link: str, time: date):
        """Open *link*, pick the wine-grape filters and load every result.

        :param link: URL of the listing page
        :param time: date stored with every row written later
        """
        self.time = time
        self.driver.get(link)

        # Select category 'Crops'
        product_selector = self.driver.find_element_by_id('filter_3_primary')
        product_selector.click()
        product_selector.send_keys(Keys.ARROW_DOWN)
        product_selector.send_keys(Keys.ENTER)
        sleep(0.3)
        if self.log:
            print('Loading category "Crops" successful.')

        # Select subcategory 'Fruit'
        crop_selector = self.driver.find_element_by_id('filter_3_secondary')
        crop_selector.click()
        crop_selector.send_keys(Keys.ARROW_DOWN)
        crop_selector.send_keys(Keys.ARROW_DOWN)
        crop_selector.send_keys(Keys.ENTER)
        sleep(0.3)
        if self.log:
            print('Loading subcategory "Fruit" successful.')

        # Select 'Grapes For Wine' (19 arrow-down presses, then Enter)
        fruit_selector = self.driver.find_element_by_id('filter_3_tertiary')
        fruit_selector.click()
        for _ in range(19):
            sleep(0.1)
            fruit_selector.send_keys(Keys.ARROW_DOWN)
        fruit_selector.send_keys(Keys.ENTER)
        if self.log:
            print('Loading "Grapes For Wine" successful.')

        # Load all producers: keep clicking until the result count stops growing.
        load_button = self.driver.find_element_by_id('scrollDown')
        n_elements = len(
            self.driver.find_elements_by_class_name('results_list_item'))
        while True:
            if self.log:
                print('Loading more vineyards.')
            for _ in range(20):
                load_button.click()
                sleep(0.1)
            n_elements_new = len(
                self.driver.find_elements_by_class_name('results_list_item'))
            if n_elements < n_elements_new:
                n_elements = n_elements_new
            else:
                if self.log:
                    print('Loaded all vineyards.')
                break

    def get_vineyard_links(self):
        """Return the absolute URLs of all results on the loaded page."""
        selector = Selector(text=self.driver.page_source)
        links = [
            'http://www.biodynamicfood.org' + link for link in selector.xpath(
                '//*[@class="results_list_item"]//a/@href').extract()
        ]
        return links

    def prepare_vineyard_parsing(self, links):
        """Create the crawler process and queue a spider that parses *links*.

        Every parsed page is appended as one row to the destination CSV.
        """
        destination = self.destination
        time = self.time

        class GenericSpider(Spider):
            name = 'vineyards'
            allowed_domains = []
            start_urls = links

            def parse(self, response):
                """Extract one organisation's details and append them to the CSV."""
                sel = Selector(response)
                name = sel.xpath('//h1/text()').extract_first()
                category = sel.xpath(
                    '//h2[@class="business-type"]/text()').extract_first()

                # A missing address line becomes '' instead of raising
                # AttributeError on None.
                address_field_1 = (sel.xpath(
                    '//div[@class="member-address"]/p/text()[1]'
                ).extract_first() or '').strip()
                address_field_2 = (sel.xpath(
                    '//div[@class="member-address"]/p/text()[2]'
                ).extract_first() or '').strip()
                address = address_field_1 + '\n' + address_field_2

                contact_info = sel.xpath(
                    '//div[@class="member-address"]/p/text()').extract()
                contact_info = [line.strip() for line in contact_info]
                # A page without a phone line yields '' instead of IndexError.
                phone = next(
                    (line for line in contact_info
                     if line.startswith('Phone: ')), '')
                phone = phone.replace('Phone: ', '')

                email = sel.xpath('//div[@class="member-address"]//a[1]/text()'
                                  ).extract_first()
                website = sel.xpath(
                    '//div[@class="member-address"]//a[2]/text()'
                ).extract_first()
                short_description = sel.xpath(
                    '//p[@class="quote"]/text()').extract_first()

                profile = sel.xpath(
                    '//div[@class="member-profile"]/div/p/text()').extract()
                profile = [element.strip() for element in profile]
                # The description is the first longest profile paragraph;
                # '' when the profile is empty (max() used to raise).
                description = max(profile, key=len, default='')

                crops = sel.xpath(
                    '//div[p/*/text()="Crops"]//li//text()').extract()
                crops = ', '.join(crops)
                processed_products = sel.xpath(
                    '//div[p/*/text()="Processed Product"]//li//text()'
                ).extract()
                processed_products = ', '.join(processed_products)

                all_text = sel.xpath('//p/text()').extract()
                all_text = [text.strip() for text in all_text]
                acreage = [text for text in all_text if 'Acres' in text]
                # Both values are needed; with fewer than two both stay empty.
                if len(acreage) >= 2:
                    cropped_acreage = acreage[0]
                    total_acreage = acreage[1]
                else:
                    print('Acreage not specified for one organization.')
                    cropped_acreage = ''
                    total_acreage = ''

                with open(destination, 'a', newline='') as output:
                    writer = csv.writer(output)
                    writer.writerow([
                        name, time, category, address, phone, email, website,
                        short_description, description, crops,
                        processed_products, cropped_acreage, total_acreage
                    ])

        # Create the crawler process and queue the spider; it runs in
        # parse_organizations().
        self.process = CrawlerProcess({
            'USER_AGENT':
            'Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1)'
        })
        self.process.crawl(GenericSpider)

    def parse_organizations(self):
        """Start the crawl prepared by ``prepare_vineyard_parsing``."""
        self.process.start()

    def close_parser(self):
        """Stop the crawler process."""
        self.process.stop()
def execute(cat_urls):
    """Run ``ZoomSpider`` over the given category URLs.

    :param cat_urls: value handed to the spider as its ``cat_urls`` keyword
        argument.
    """
    project_settings = get_project_settings()
    crawler_process = CrawlerProcess(project_settings)
    crawler_process.crawl(ZoomSpider, cat_urls=cat_urls)
    # Start the crawl, then ask the process to stop its crawlers.
    crawler_process.start()
    crawler_process.stop()
def main():
    """Run the AFAQ pipeline steps selected on the command line.

    Steps run in this order when enabled: pull (write SSH material and obtain
    the data repo), rm (clear the local data), crawl (run the ``afaq``
    spider), convert (HTML to Markdown) and push (commit and push changes).

    When ``--all`` is given, or when no step switch is given at all, every
    step is enabled and debug logging is turned on. Selecting individual
    steps runs only those steps.
    """
    settings = get_project_settings()

    parser = argparse.ArgumentParser()
    parser.add_argument('-d', '--debug',
                        help='Set logging level to debug',
                        action='store_true')
    parser.add_argument('-v', '--version', action='version',
                        help='version',
                        version='%(prog)s ' + version)
    parser.add_argument('-o', '--outputdir',
                        help='output local path',
                        default=conf.DATA_LOCAL_REPO_PATH)
    parser.add_argument('-c', '--crawl',
                        help='Crawl AFAQ.',
                        action='store_true')
    parser.add_argument('-g', '--pull',
                        help='Pull git repo before obtaining AFAQ.',
                        action='store_true')
    parser.add_argument('-p', '--push',
                        help='Push to git repo AFAQ changes.',
                        action='store_true')
    parser.add_argument('-r', '--rm',
                        help='Remove content outputdir before crawling.',
                        action='store_true')
    parser.add_argument('-m', '--convert',
                        help='Convert obtained AFAQ to other formats.',
                        action='store_true')
    # No default=True here: with it, --all was always set and every other
    # switch was silently ignored.
    parser.add_argument('-a', '--all',
                        help='Equivalent to -g, -c, -p, -r, -m.',
                        action='store_true')
    args = parser.parse_args()

    # Running without any step switch keeps the historical "do everything"
    # behaviour (including debug logging).
    step_selected = any(
        (args.pull, args.crawl, args.push, args.rm, args.convert))
    if args.all or not step_selected:
        args.pull = args.crawl = args.push = args.convert = args.rm = \
            args.debug = True

    if args.push and not args.pull:
        # The push step needs the repo object produced by the pull step;
        # fail with a usage error instead of a NameError later on.
        parser.error('--push requires --pull (the repository is obtained '
                     'during the pull step)')

    # configure_logging()
    logging.basicConfig(format=conf.LOG_FORMAT)
    logging.getLogger('scrapy').propagate = False
    logger = logging.getLogger('root')
    logger.propagate = True
    if args.debug:
        logger.setLevel(logging.DEBUG)
        # settings.set('DEBUG', True)
        # conf.DEBUG = True

    if args.pull:
        # Write ssh keys and command needed for git_utils
        if not os.path.isdir(conf.SSH_PATH):
            os.makedirs(conf.SSH_PATH)
            logger.debug('Created ssh dir: %s.', conf.SSH_PATH)
        if system.ismorpio():
            # NOTE(review): this uses conf.SSH_DIR while the directory
            # created above is conf.SSH_PATH -- confirm both are intended.
            git_utils.write_ssh_keys(conf.SSH_DIR,
                                     conf.MORPH_SSH_PRIV_KEY_ENV,
                                     conf.MORPH_SSH_PUB_KEY_ENV,
                                     conf.SSH_PRIV_KEY_PATH,
                                     conf.SSH_PUB_KEY_PATH)
            git_utils.write_ssh_command(conf.GIT_SSH_COMMAND_PATH,
                                        conf.GIT_SSH_COMMAND_MORPHIO)
        else:
            git_utils.write_ssh_command(conf.GIT_SSH_COMMAND_PATH,
                                        conf.GIT_SSH_COMMAND)
        git_utils.write_ssh_key_server(conf.GITLAB_SSH_PUB_KEY,
                                       conf.SSH_PUB_KEY_SERVER_PATH)
        # Pull or clone the data repos
        logger.debug('Remote repo name %s',
                     conf.DATA_REMOTE_REPO.get('name'))
        local_repo, remote_repo = git_utils.obtain_repo(
            conf.DATA_LOCAL_REPO_PATH,
            conf.DATA_REMOTE_REPO,
            conf.GIT_SSH_COMMAND_PATH,
            False)

    if args.rm:
        # rm files in case they are deleted in the sources
        # TODO: if this is removed then the files removed should be
        # detected on git commit
        system.rm_data(conf.DATA_LOCAL_REPO_PATH)

    if args.crawl:
        # Run the scraper
        process = CrawlerProcess(settings)
        process.crawl('afaq')
        process.start()
        process.stop()

    if args.convert:
        # Conversions
        convert.convert_dir(conf.HTML_PATH, conf.MD_PATH, convert.html2md,
                            '.md')
        # NOTE: since there is already md, txt is not needed
        #convert.html2txt(conf.HTML2TXT_COMMAND, conf.HTML_PATH)

    if args.push:
        # Push the scraped data in the repos
        git_utils.commit_push_if_changes(local_repo,
                                         conf.GIT_AUTHOR_NAME,
                                         conf.GIT_AUTHOR_EMAIL,
                                         conf.GIT_SSH_COMMAND_PATH,
                                         conf.DATA_REMOTE_REPO,
                                         conf.METADATA_PATH)
if __name__ == '__main__':
    # Script entry point: read configuration from the environment (.env),
    # set up file logging, build one start URL per trademark and run
    # FeefoSpider over them.
    load_dotenv(find_dotenv())
    trademarks_list_file = os.getenv("TRADEMARKS")
    timeframe = os.getenv("TIMEFRAME")
    configure_logging(install_root_handler=False)

    session_dir_env = os.getenv("SESSIONDIR")
    if not session_dir_env:
        # os.path.abspath(None) would fail with an opaque TypeError.
        raise SystemExit('SESSIONDIR environment variable is not set')
    session_dir = os.path.abspath(session_dir_env)
    mkdir_p(session_dir)

    log_dir = os.path.abspath(
        os.getenv("LOGDIR", os.path.join(SCRIPT_DIR, 'Log')))
    # Produces e.g. feefo_20240131_235959.log
    log_file_name = ('feefo_'
                     + datetime.utcnow().strftime('%Y%m%d_%H%M%S')
                     + '.log')
    mkdir_p(log_dir)
    logging.basicConfig(
        filename=os.path.join(log_dir, log_file_name),
        format='%(levelname)s: %(message)s',
        level=logging.INFO
    )

    if trademarks_list_file and timeframe:
        try:
            with open(os.path.abspath(trademarks_list_file), 'r') as f:
                # Skip blank lines so they do not turn into start URLs with
                # an empty trademark.
                trademarks = [line.strip() for line in f if line.strip()]
        except (OSError, UnicodeError) as exc:
            print(f'File {trademarks_list_file} open error: {exc}')
            # Non-zero status so callers can detect the failure.
            raise SystemExit(1)
        else:
            start_urls = [
                FeefoSpider.base_url.format(trademark=trademark,
                                            timeframe=timeframe)
                for trademark in trademarks]
            process = CrawlerProcess(get_project_settings())
            process.crawl(FeefoSpider,
                          start_urls=start_urls,
                          session_dir=session_dir,
                          connection=Connection()
                          )
            process.start()
            process.stop()
    else:
        # Previously this case finished silently without crawling anything.
        print('TRADEMARKS and TIMEFRAME must both be set; nothing to crawl.')
def run(self, in_theater_ids, out_theater_ids):
    """Run the ``get_movies`` spider with the two theater id collections.

    :param in_theater_ids: first positional argument forwarded to the spider
    :param out_theater_ids: second positional argument forwarded to the spider
    """
    spider_args = (in_theater_ids, out_theater_ids)
    crawler_process = CrawlerProcess(self.scrapy_settings)
    # The spider is referenced by its registered name, not by class.
    crawler_process.crawl('get_movies', *spider_args)
    crawler_process.start()
    crawler_process.stop()
def execute(urls):
    """Run ``KabumSpider`` over the given category URLs.

    :param urls: value handed to the spider as its ``categories`` keyword
        argument.
    """
    project_settings = get_project_settings()
    crawler_process = CrawlerProcess(project_settings)
    crawler_process.crawl(KabumSpider, categories=urls)
    # Start the crawl, then ask the process to stop its crawlers.
    crawler_process.start()
    crawler_process.stop()
def _crawl(path=None):
    """Run ``ProvinceSpider`` with a fixed browser-like user agent.

    :param path: accepted for the caller's convenience; not used by this
        function.
    """
    crawler_settings = {
        'USER_AGENT': 'Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1)',
    }
    crawler_process = CrawlerProcess(crawler_settings)
    crawler_process.crawl(ProvinceSpider)
    crawler_process.start()
    crawler_process.stop()
def execute(tags_urls):
    """Run ``TagSpider`` over the given tag URLs.

    :param tags_urls: iterable of strings; they are joined with ``;`` and
        handed to the spider as its ``tags`` keyword argument.
    """
    joined_tags = ';'.join(tags_urls)
    crawler_settings = {'LOG_LEVEL': LOG_LEVEL}
    crawler_process = CrawlerProcess(crawler_settings)
    crawler_process.crawl(TagSpider, tags=joined_tags)
    crawler_process.start()
    crawler_process.stop()
def run_config(config):
    """Crawl the documentation described by *config* and index it in MeiliSearch.

    :param config: raw configuration, passed as-is to ``ConfigLoader``.

    Side effects: sets ``CustomDownloaderMiddleware.driver``, resets
    ``DocumentationSpider.NB_INDEXED``, prints a short report and updates the
    stored hit count on success.

    :raises SystemExit: with ``EXIT_CODE_NO_RECORD`` when the crawl indexed
        nothing.
    """
    # Local import: sys.exit replaces the interactive-only exit() helper.
    import sys

    config = ConfigLoader(config)
    CustomDownloaderMiddleware.driver = config.driver
    DocumentationSpider.NB_INDEXED = 0

    strategy = DefaultStrategy(config)

    meilisearch_helper = MeiliSearchHelper(
        config.app_id,
        config.api_key,
        config.index_uid,
        config.custom_settings
    )

    # Dotted paths differ depending on whether this module runs as a script
    # or is imported from the ``scraper`` package.
    root_module = 'src.' if __name__ == '__main__' else 'scraper.src.'
    DOWNLOADER_MIDDLEWARES_PATH = root_module + 'custom_downloader_middleware.' + CustomDownloaderMiddleware.__name__
    DOWNLOADER_CLIENTCONTEXTFACTORY = root_module + 'scrapy_patch.' + CustomContextFactory.__name__
    DUPEFILTER_CLASS_PATH = root_module + 'custom_dupefilter.' + CustomDupeFilter.__name__

    # Defaults for scrapy https://docs.scrapy.org/en/latest/topics/settings.html#default-request-headers
    headers = {
        "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8",
        "Accept-Language": "en",
    }

    # Optional authentication headers, driven by environment variables.
    if os.getenv("CF_ACCESS_CLIENT_ID") and os.getenv("CF_ACCESS_CLIENT_SECRET"):
        headers.update(
            {
                "CF-Access-Client-Id": os.getenv("CF_ACCESS_CLIENT_ID"),
                "CF-Access-Client-Secret": os.getenv("CF_ACCESS_CLIENT_SECRET"),
            }
        )
    elif os.getenv("IAP_AUTH_CLIENT_ID") and os.getenv("IAP_AUTH_SERVICE_ACCOUNT_JSON"):
        # Apply the auth object to an empty request to obtain the header value.
        iap_token = IAPAuth(
            client_id=os.getenv("IAP_AUTH_CLIENT_ID"),
            service_account_secret_dict=json.loads(
                os.getenv("IAP_AUTH_SERVICE_ACCOUNT_JSON")
            ),
        )(requests.Request()).headers["Authorization"]
        headers.update({"Authorization": iap_token})

    DEFAULT_REQUEST_HEADERS = headers

    try:
        process = CrawlerProcess({
            'LOG_ENABLED': '1',
            'LOG_LEVEL': 'ERROR',
            'USER_AGENT': config.user_agent,
            'DOWNLOADER_MIDDLEWARES': {DOWNLOADER_MIDDLEWARES_PATH: 900},
            # Need to be > 600 to be after the redirectMiddleware
            'DOWNLOADER_CLIENTCONTEXTFACTORY': DOWNLOADER_CLIENTCONTEXTFACTORY,
            'DUPEFILTER_USE_ANCHORS': config.use_anchors,
            # Use our custom dupefilter in order to be scheme agnostic regarding link provided
            'DUPEFILTER_CLASS': DUPEFILTER_CLASS_PATH,
            'DEFAULT_REQUEST_HEADERS': DEFAULT_REQUEST_HEADERS,
        })

        process.crawl(
            DocumentationSpider,
            config=config,
            meilisearch_helper=meilisearch_helper,
            strategy=strategy
        )

        process.start()
        process.stop()
    finally:
        # Kill browser if needed -- also when the crawl raised, so the
        # browser is not left running.
        BrowserHandler.destroy(config.driver)

    if len(config.extra_records) > 0:
        meilisearch_helper.add_records(config.extra_records, "Extra records", False)

    print("")

    if DocumentationSpider.NB_INDEXED > 0:
        # meilisearch_helper.commit_tmp_index()
        print('Nb hits: {}'.format(DocumentationSpider.NB_INDEXED))
        config.update_nb_hits_value(DocumentationSpider.NB_INDEXED)
    else:
        print('Crawling issue: nbHits 0 for ' + config.index_uid)
        # meilisearch_helper.report_crawling_issue()
        sys.exit(EXIT_CODE_NO_RECORD)
    print("")
import sched
import time
from multiprocessing import Process

from scrapy.crawler import CrawlerProcess
from finan.spiders.yobit_spider import YobitSpider
from scrapy.utils.project import get_project_settings


def _crawl_once():
    """Run a single YobitSpider crawl in the current process."""
    process = CrawlerProcess(
        {'USER_AGENT': 'Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1)'})
    process.crawl(YobitSpider)
    process.start()
    process.stop()


def _crawl_in_child():
    """Run one crawl in a fresh child process and wait for it to finish.

    The Twisted reactor behind CrawlerProcess cannot be started a second
    time in the same interpreter, so every crawl gets its own process (and
    therefore its own reactor). The parent never starts a reactor itself.
    """
    worker = Process(target=_crawl_once)
    worker.start()
    worker.join()


# The guard is required because child processes may re-import this module.
if __name__ == '__main__':
    # Initial crawl, then one crawl every ``waiting`` seconds, forever.
    _crawl_in_child()

    scheduler = sched.scheduler(time.time, time.sleep)
    waiting = 10
    while True:
        scheduler.enter(waiting, 2, _crawl_in_child)
        scheduler.run()  # sleeps for ``waiting`` seconds, then crawls
        print(
            "\n waiting for %d seconds before more action. Pres CTRL+Z to cancel\n"
            % (waiting))
class Processor(Process):
    '''
    Start a twisted reactor and run the provided scrapy spiders.
    Blocks until all have finished.

    Results are not returned by run(); fetch them afterwards with data()
    and count().
    '''

    def __init__(self, settings=None, item_scraped=True):
        '''
        Parms:
          settings (scrapy.settings.Settings) - settings to apply.  Defaults
            to Scrapy default settings.
          item_scraped (bool) - when exactly True, every scraped item is
            stored (as a dict); otherwise items are only counted.
        '''
        # NOTE(review): __import__ on a dotted name returns the top-level
        # package, so ``ctx`` is the ``billiard`` package itself -- confirm
        # this is what Queue expects.
        kwargs = {'ctx': __import__('billiard.synchronize')}

        self.results = Queue(**kwargs)
        self.counts = Queue(**kwargs)
        self.items = {}         # spider name -> list of scraped items
        self.items_count = {}   # spider name -> number of scraped items
        self.settings = settings or Settings()
        self.item_scraped = item_scraped
        dispatcher.connect(self._item_passed, signals.item_passed)

    def _item_passed(self, item, response, spider):
        '''Signal handler: record one scraped item under its spider's name.'''
        if spider.name not in self.items:
            self.items[spider.name] = []
            self.items_count[spider.name] = 0
        if self.item_scraped is True:
            self.items[spider.name].append(dict(item))
        self.items_count[spider.name] += 1

    def _crawl(self, requests):
        '''
        Parameters:
            requests (Request) - One or more Jobs. All will be loaded into
                a single invocation of the reactor.

        Puts the collected items and the per-spider counts on the result
        queues once the crawl has finished.
        '''
        self.crawler = CrawlerProcess(self.settings)

        # crawl can be called multiple times to queue several requests
        for req in requests:
            self.crawler.crawl(req.spider, *req.args, **req.kwargs)

        self.crawler.start()
        self.crawler.stop()
        self.results.put(self.items)
        self.counts.put(self.items_count)

    def run(self, jobs):
        '''Start the Scrapy engine, and execute all jobs in a child process.

        Parms:
          jobs ([Job]) - one or more Job objects to be processed.

        Returns:
          None.  Use data() and count() to retrieve what the spiders
          yielded.

        Raises:
          ScrapyScriptException - if any element of jobs is not a Job.
        '''
        # collections.Iterable was removed in Python 3.10; the ABC lives in
        # collections.abc.
        try:
            from collections.abc import Iterable
        except ImportError:  # very old interpreters
            from collections import Iterable

        if isinstance(jobs, Iterable):
            # Materialise so a one-shot iterable is not exhausted by
            # validate() before the worker sees it.
            jobs = list(jobs)
        else:
            jobs = [jobs]
        self.validate(jobs)

        p = Process(target=self._crawl, args=[jobs])
        p.start()
        p.join()
        p.terminate()

    def data(self):
        '''Return the next items mapping from the results queue.'''
        return self.results.get()

    def count(self):
        '''Return the next per-spider count mapping from the counts queue.'''
        return self.counts.get()

    def validate(self, jobs):
        '''Raise ScrapyScriptException unless every element of jobs is a Job.'''
        if not all(isinstance(x, Job) for x in jobs):
            raise ScrapyScriptException('scrapy-script requires Job objects.')
def run_config(config):
    """Crawl the documentation described by *config* and index it in Algolia.

    :param config: raw configuration, passed as-is to ``ConfigLoader``.

    Side effects: sets ``CustomDownloaderMiddleware.driver``, resets
    ``DocumentationSpider.NB_INDEXED``, prints a short report and, on
    success, commits the temporary index and updates the stored hit count.

    :raises SystemExit: with ``EXIT_CODE_NO_RECORD`` when the crawl indexed
        nothing (after reporting the crawling issue).
    """
    # Local import: sys.exit replaces the interactive-only exit() helper.
    import sys

    config = ConfigLoader(config)
    CustomDownloaderMiddleware.driver = config.driver
    DocumentationSpider.NB_INDEXED = 0

    if config.use_anchors:
        # Imported only when anchors are in use.
        from . import scrapy_patch

    strategy = DefaultStrategy(config)

    algolia_helper = AlgoliaHelper(
        config.app_id,
        config.api_key,
        config.index_name,
        AlgoliaSettings.get(config, strategy.levels),
        config.query_rules)

    # Dotted paths differ depending on whether this module runs as a script
    # or is imported from the ``scraper`` package.
    DOWNLOADER_MIDDLEWARES_PATH = 'scraper.src.custom_downloader_middleware.CustomDownloaderMiddleware'
    DOWNLOADER_CLIENTCONTEXTFACTORY = 'scraper.src.scrapy_patch.CustomContextFactory'
    DUPEFILTER_CLASS_PATH = 'scraper.src.custom_dupefilter.CustomDupeFilter'

    if __name__ == '__main__':
        DOWNLOADER_MIDDLEWARES_PATH = 'src.custom_downloader_middleware.CustomDownloaderMiddleware'
        DOWNLOADER_CLIENTCONTEXTFACTORY = 'src.scrapy_patch.CustomContextFactory'
        DUPEFILTER_CLASS_PATH = 'src.custom_dupefilter.CustomDupeFilter'

    try:
        process = CrawlerProcess({
            'LOG_ENABLED': '1',
            'LOG_LEVEL': 'ERROR',
            'USER_AGENT': config.user_agent,
            'DOWNLOADER_MIDDLEWARES': {
                DOWNLOADER_MIDDLEWARES_PATH: 900
            },
            # Need to be > 600 to be after the redirectMiddleware
            'DOWNLOADER_CLIENTCONTEXTFACTORY': DOWNLOADER_CLIENTCONTEXTFACTORY,
            'DUPEFILTER_USE_ANCHORS': config.use_anchors,
            # Use our custom dupefilter in order to be scheme agnostic regarding link provided
            'DUPEFILTER_CLASS': DUPEFILTER_CLASS_PATH
        })

        process.crawl(DocumentationSpider,
                      config=config,
                      algolia_helper=algolia_helper,
                      strategy=strategy)

        process.start()
        process.stop()
    finally:
        # Kill browser if needed -- also when the crawl raised, so the
        # browser is not left running.
        BrowserHandler.destroy(config.driver)

    if len(config.extra_records) > 0:
        algolia_helper.add_records(config.extra_records, "Extra records")

    print("")

    if DocumentationSpider.NB_INDEXED > 0:
        algolia_helper.commit_tmp_index()
        print('Nb hits: ' + str(DocumentationSpider.NB_INDEXED))
        config.update_nb_hits_value(DocumentationSpider.NB_INDEXED)
    else:
        print('Crawling issue: nbHits 0 for ' + config.index_name)
        algolia_helper.report_crawling_issue()
        sys.exit(EXIT_CODE_NO_RECORD)
    print("")