def test_create_valid_job(self):
    spider = Spider(name='testing')
    job = Job(spider)
    self.assertIsInstance(job.spider, Spider)
    job2 = Job(spider, {'data': 0})
    self.assertEqual(job2.payload['data'], 0)

def scrape(self, ind):
    db = FirebaseAccess()
    urls1 = [
        'https://www.class-central.com/subject/cs',
        'https://www.class-central.com/subject/business',
        'https://www.class-central.com/subject/science',
        'https://www.class-central.com/subject/data-science',
        'https://www.class-central.com/subject/programming-and-software-development',
        'https://www.class-central.com/subject/engineering',
        'https://www.class-central.com/subject/maths',
    ]
    urls2 = [
        'https://www.class-central.com/subject/humanities',
        'https://www.class-central.com/subject/social-sciences',
        'https://www.class-central.com/subject/education',
        'https://www.class-central.com/subject/personal-development',
        'https://www.class-central.com/subject/art-and-design',
        'https://www.class-central.com/subject/health',
    ]
    if ind == 1:
        sched = Job(CoursesSpider, fbadb=db, urls_to_scrape=urls1)
    else:
        sched = Job(CoursesSpider, fbadb=db, urls_to_scrape=urls2)
    processor = Processor(settings=None)
    data = processor.run([sched])

def test_multiple_jobs(self):
    jobs = [
        Job(MySpider, url='http://www.python.org'),
        Job(MySpider, url='http://www.github.com'),
    ]
    results = Processor().run(jobs)
    self.assertEqual(len(results), 4)

def test_multiple_jobs(self):
    jobs = [
        Job.from_xpath('http://www.python.org', '//title/text()'),
        Job.from_xpath('http://www.python.org', '//title/text()'),
    ]
    results = Processor().run(jobs)
    self.assertEqual(len(results), 2)

def test_multiple_jobs_return_job_specific_data_in_each_result():
    jobs = [
        Job(TitleSpider, url="http://www.python.org"),
        Job(TitleSpider, url="http://www.github.com"),
    ]
    results = Processor().run(jobs)
    data = [item["data"].lower() for item in results]
    assert any("python" in s for s in data)
    assert any("github" in s for s in data)
    assert len(results) == 2

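# Several tests in this section reference a helper spider named TitleSpider
# whose definition is not included in these snippets. The sketch below is a
# hypothetical minimal version, written so that it yields the page title under
# the "data" key that the test above inspects; the real helper may differ.
from scrapy import Spider, Request


class TitleSpider(Spider):
    name = "titlespider"

    def start_requests(self):
        # `url` is supplied as a keyword argument via Job(TitleSpider, url=...)
        # and set on the spider instance by Scrapy's default __init__.
        yield Request(self.url)

    def parse(self, response):
        # Yield the page <title> text under the "data" key.
        yield {"data": response.xpath("//title/text()").get() or ""}
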
def test_settings_flow_through_to_spider(self):
    settings = Settings()
    settings['BOT_NAME'] = 'alpha'
    job = Job(MySpider, url='http://www.python.org')
    results = Processor(settings=settings).run(job)
    self.assertIn({'bot': 'alpha'}, results)

def test_settings_flow_through_to_spider():
    settings = Settings()
    settings["BOT_NAME"] = "alpha"
    job = Job(ItemSpider, url="http://www.python.org")
    results = Processor(settings=settings).run(job)
    assert results[0]["bot"] == "alpha"

def test_settings_flow_through_to_spider(self):
    settings = Settings()
    settings['BOT_NAME'] = 'alpha'
    job = Job(TestSpider())
    results = Processor(settings=settings).run(job)
    self.assertIn({'bot': 'alpha'}, results)

def test_run_calls_process_join_terminate(mocker):
    mock_proc = mocker.patch("scrapyscript.Process")
    mock_q = mocker.patch("scrapyscript.Queue")
    job = Job(TitleSpider, url="http://www.python.org")
    print(Processor().run(job))
    mock_proc().start.assert_called_once()
    mock_proc().join.assert_called_once()
    mock_proc().terminate.assert_called_once()

def get_product_info(term):
    candleJob = Job(
        CandleSpider,
        url="https://www.yankeecandle.com/search?Ntt=" + quote(term),
    )
    processor = Processor(settings=None)
    results = processor.run([candleJob])
    if len(results) == 0:
        return None
    else:
        return results[0]

def scrape(self, query):
    # Create jobs for each instance. *args and **kwargs supplied here will
    # be passed to the spider constructor at runtime.
    myJob = Job(ClassCentralSpider, keys=query)

    # Create a Processor, optionally passing in a Scrapy Settings object.
    processor = Processor(settings=None)

    # Start the reactor, and block until all spiders complete.
    data = processor.run([myJob])

def crawl_lad_scrapyscript(depth=lad_depth, urls=None, domain=lad_domain):
    """Version of crawl_lad that allows repeated runs on one worker without a restart."""
    settings = scrapy_settings(depth, concurrent_requests)
    if urls is None:
        urls = list(get_gov_websites(gov_sites_path))
    job = Job(LadSpider, urls, domain, depth)
    processor = Processor(settings=settings)
    data = processor.run([job])
    print(json.dumps(data, indent=4))

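# `scrapy_settings` is a project helper that is not shown in this snippet. The
# sketch below is a hypothetical implementation consistent with how it is
# called above: it builds a Scrapy Settings object that caps crawl depth and
# limits concurrency. The real helper may set additional options.
from scrapy.settings import Settings


def scrapy_settings(depth, concurrent_requests):
    settings = Settings()
    settings.set("DEPTH_LIMIT", depth)
    settings.set("CONCURRENT_REQUESTS", concurrent_requests)
    return settings
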
def scrap_btls_scoreboard():
    settings = Settings()
    settings.set("USER_AGENT", "Jesse McClure (+http://jrmcclure.github.io)")
    settings.set("BOT_NAME", "btls_test")
    settings.set("ROBOTSTXT_OBEY", False)
    settings.set("ITEM_PIPELINES", {
        'btls_test.btls_test.pipelines.BtlsTestPipeline': 1000,
    })
    job = Job(TestSpider())
    Processor(settings).run(job)
    return 'Return from scrapy {}'.format(10)

def test_crawl_calls_crawlerprocess_with_correct_params(mocker):
    mock_crawl = mocker.patch("scrapyscript.CrawlerProcess")
    mock_crawl().crawl.return_value = None
    mock_crawl().start.return_value = None
    mock_crawl().stop.return_value = None
    url = "http://www.python.org"
    job = Job(TitleSpider, url=url)
    Processor()._crawl([job])
    mock_crawl().crawl.assert_called_with(job.spider, url=url)
    mock_crawl().start.assert_called_once()
    mock_crawl().stop.assert_called_once()

def spider_results():
    # Define a Scrapy Spider, which can accept *args or **kwargs
    # https://doc.scrapy.org/en/latest/topics/spiders.html#spider-arguments
    class PythonSpider(Spider):
        name = 'myspider'

        def start_requests(self):
            yield Request(self.url)

        def parse(self, response):
            # title = response.xpath('//title/text()').extract()
            precio_meta = response.xpath(
                '//*[@id="root-app"]/div/div[3]/div/div[2]/div[1]/div/div[3]/div/div[1]/div/span/span[2]/text()'
            ).extract()
            return {'url': response.request.url, 'precio': precio_meta}

    # Create jobs for each instance. *args and **kwargs supplied here will
    # be passed to the spider constructor at runtime.
    githubJob = Job(
        PythonSpider,
        url='https://articulo.mercadolibre.com.ar/MLA-850664638-cuadernos-anotador-2020-modelos-de-diseno-_JM#position=1&type=item&tracking_id=cb49fd5e-5e5d-4e33-903b-66f14e0f3ac5'
    )
    # pythonJob = Job(PythonSpider, url='http://www.python.org')

    # Create a Processor, optionally passing in a Scrapy Settings object.
    cust_settings = Settings()
    cust_settings['USER_AGENT'] = (
        "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 "
        "(KHTML, like Gecko) Chrome/34.0.1847.131 Safari/537.36"
    )
    processor = Processor(settings=cust_settings)

    # Start the reactor, and block until all spiders complete.
    data = processor.run([githubJob])

    # Return the consolidated results as a JSON string.
    # print(json.dumps(data, indent=4))
    return json.dumps(data, indent=4)

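# A possible call site for spider_results() above (not part of the original
# snippet): the call blocks until the crawl finishes and returns a JSON string.
if __name__ == "__main__":
    print(spider_results())
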
def run_crawler(url, numofpages):
    newsJob = Job(CrawlerSpider, url=url, numofpages=numofpages)
    processor = Processor(get_project_settings())
    processor.run([newsJob])
    return None

def test_for_deadlock():
    jobs = [Job(TitleSpider, url="http://www.python.org") for i in range(50)]
    results = Processor().run(jobs)
    assert len(results) == 50

def test_big_return_value():
    job = Job(BigSpider, url="http://www.python.org")
    results = Processor().run(job)
    assert len(results) == 1

def test_bad_return_value():
    job = Job(BadSpider)
    results = Processor().run(job)
    assert results == []

def test_args_kwargs_passed_to_spider():
    spider = ParamReturnSpider
    job = Job(spider, "cat1", fruit="banana")
    result = Processor().run(job)
    assert result == [dict(category="cat1", fruit="banana")]

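# ParamReturnSpider is referenced by the two parameter-passing tests in this
# section but is not defined here. The sketch below is a hypothetical minimal
# version matching the asserted result: it captures the positional and keyword
# arguments supplied to Job(...) and echoes them back as a single item.
from scrapy import Spider, Request


class ParamReturnSpider(Spider):
    name = "paramreturnspider"

    def __init__(self, category=None, fruit=None, *args, **kwargs):
        super().__init__(*args, **kwargs)
        self.category = category
        self.fruit = fruit

    def start_requests(self):
        # A request is only needed to drive the crawl; the parsed item simply
        # echoes the constructor parameters.
        yield Request("http://www.python.org", self.parse)

    def parse(self, response):
        yield dict(category=self.category, fruit=self.fruit)
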
def test_create_valid_job(self):
    spider = MySpider
    job = Job(spider)
    self.assertIsInstance(job, Job)

def test_parameters_passed_to_spider(self):
    spider = ParamReturnSpider
    job = Job(spider, 'cat1', fruit='banana')
    result = Processor().run(job)
    self.assertEqual(result, [dict(category='cat1', fruit='banana')])

def test_job_validate_raises_exception_if_not_jobs():
    with pytest.raises(ScrapyScriptException):
        jobs = [Job(TitleSpider, url="http://www.python.org"), "not a Job"]
        p = Processor()
        p.validate(jobs)

def celery_job_with_custom_settings(url, settings):
    job = Job(ItemSpider, url=url)
    return Processor(settings=settings).run(job)

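# The celery_* helpers in this section are written so they can be wrapped as
# Celery tasks, since scrapyscript runs each crawl in its own process and
# returns plain, serializable results. The registration below is a hypothetical
# sketch; the app name and broker URL are assumptions, not part of the
# original code.
from celery import Celery

app = Celery("scrapers", broker="redis://localhost:6379/0")


@app.task
def crawl_title(url):
    job = Job(TitleSpider, url=url)
    return Processor().run(job)
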
def test_bad_return_value(self):
    job = Job(BadSpider, url='http://www.python.org')
    results = Processor().run(job)
    self.assertEqual(results, [])

def test_create_valid_job():
    spider = TitleSpider
    job = Job(spider)
    assert isinstance(job, Job)

def work(url, g_id):
    broker_job = Job(BrokenImageChecker, url=url, g_id=g_id)
    processor = Processor(settings=None)
    result = processor.run([broker_job])
    return result

from scrapyscript import Job, Processor
from scrapy.utils.project import get_project_settings
from web_site_info.spiders.site_info import SiteInfoSpider

if __name__ == "__main__":
    start_urls = open(0).read().splitlines()
    jobs = list()
    for url in start_urls:
        job = Job(SiteInfoSpider, url=url)
        jobs.append(job)
    processor = Processor(get_project_settings())
    data = processor.run(jobs)
    for item in data:
        print(item['_values'])

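# Example invocation for the script above (assumed, based on `open(0)` reading
# start URLs from stdin; the file names are hypothetical):
#
#     python site_info_runner.py < urls.txt
#
# where urls.txt contains one start URL per line.
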
def test_job_raises_if_no_spider_provided():
    with pytest.raises(TypeError):
        Job()

def celery_job(url):
    job = Job(TitleSpider, url=url)
    return Processor().run(job)