Example 1
    def scrape(self, ind):
        db = FirebaseAccess()
        urls1 = [
            'https://www.class-central.com/subject/cs',
            'https://www.class-central.com/subject/business',
            'https://www.class-central.com/subject/science',
            'https://www.class-central.com/subject/data-science',
            'https://www.class-central.com/subject/programming-and-software-development',
            'https://www.class-central.com/subject/engineering',
            'https://www.class-central.com/subject/maths'
        ]
        urls2 = [
            'https://www.class-central.com/subject/humanities',
            'https://www.class-central.com/subject/social-sciences',
            'https://www.class-central.com/subject/education',
            'https://www.class-central.com/subject/personal-development',
            'https://www.class-central.com/subject/art-and-design',
            'https://www.class-central.com/subject/health'
        ]
        if ind == 1:
            sched = Job(CoursesSpider, fbadb=db, urls_to_scrape=urls1)
        else:
            sched = Job(CoursesSpider, fbadb=db, urls_to_scrape=urls2)
        processor = Processor(settings=None)
        data = processor.run([sched])
Example 2
def get_product_info(term):
    candleJob = Job(CandleSpider,
                    url="https://www.yankeecandle.com/search?Ntt=" +
                    quote(term))
    processor = Processor(settings=None)
    results = processor.run([candleJob])
    if len(results) == 0:
        return None
    else:
        return results[0]
Example 3
    def scrape(self, query):
        # Create jobs for each instance. *args and **kwargs supplied here will
        # be passed to the spider constructor at runtime
        myJob = Job(ClassCentralSpider, keys=query)

        # Create a Processor, optionally passing in a Scrapy Settings object.
        processor = Processor(settings=None)

        # Start the reactor, and block until all spiders complete.
        data = processor.run([myJob])
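
ClassCentralSpider is not shown in this snippet. Since keyword arguments given to Job() are forwarded to the spider at runtime (Scrapy's default constructor turns them into attributes), a minimal sketch might look like the following; the class body, the search URL, and the returned fields are assumptions for illustration, not the original project's code.

from scrapy import Spider, Request


class ClassCentralSpider(Spider):
    # Hypothetical sketch: `keys=query` passed to Job() above becomes
    # self.keys on the spider via Scrapy's default __init__.
    name = 'classcentral'

    def start_requests(self):
        # The search URL is an assumption for illustration only.
        yield Request('https://www.class-central.com/search?q=' + self.keys)

    def parse(self, response):
        return {'query': self.keys,
                'title': response.xpath('//title/text()').get()}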
Example 4
def crawl_lad_scrapyscript(depth=lad_depth, urls=None, domain=lad_domain):
    """Version of crawl_lad that assures multiple run on one worker without restart.    """
    settings = scrapy_settings(depth, concurrent_requests)

    if urls is None:
        urls = list(get_gov_websites(gov_sites_path))

    job = Job(LadSpider, urls, domain, depth)
    processor = Processor(settings=settings)
    data = processor.run([job])
    print(json.dumps(data, indent=4))
Example 5
    def test_settings_flow_through_to_spider(self):
        settings = Settings()
        settings['BOT_NAME'] = 'alpha'
        job = Job(MySpider, url='http://www.python.org')
        results = Processor(settings=settings).run(job)

        self.assertIn({'bot': 'alpha'}, results)
Example 6
    def test_settings_flow_through_to_spider(self):
        settings = Settings()
        settings['BOT_NAME'] = 'alpha'
        job = Job(TestSpider())
        results = Processor(settings=settings).run(job)

        self.assertIn({'bot': 'alpha'}, results)
Example 7
def test_settings_flow_through_to_spider():
    settings = Settings()
    settings["BOT_NAME"] = "alpha"
    job = Job(ItemSpider, url="http://www.python.org")
    results = Processor(settings=settings).run(job)

    assert results[0]["bot"] == "alpha"
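
None of these settings tests show ItemSpider, TestSpider, or MySpider themselves. For the 'bot' assertion to hold, the spider has to report the BOT_NAME it was started with; a minimal sketch consistent with that behaviour follows (the class body is an assumption inferred from the test, not the real fixture).

from scrapy import Spider, Request


class ItemSpider(Spider):
    # Hypothetical sketch: echoes the BOT_NAME from the crawler settings,
    # which is the value the assertion above checks for.
    name = "itemspider"

    def start_requests(self):
        yield Request(self.url)

    def parse(self, response):
        return {"bot": self.settings.get("BOT_NAME")}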
Example 8
    def test_multiple_jobs(self):
        jobs = [
            Job.from_xpath('http://www.python.org', '//title/text()'),
            Job.from_xpath('http://www.python.org', '//title/text()'),
        ]

        results = Processor().run(jobs)
        self.assertEqual(len(results), 2)
Example 9
    def test_multiple_jobs(self):
        jobs = [
            Job(MySpider, url='http://www.python.org'),
            Job(MySpider, url='http://www.github.com')
        ]

        results = Processor().run(jobs)
        self.assertEqual(len(results), 4)
Example 10
def test_run_calls_process_join_terminate(mocker):
    mock_proc = mocker.patch("scrapyscript.Process")
    mock_q = mocker.patch("scrapyscript.Queue")

    job = Job(TitleSpider, url="http://www.python.org")
    print(Processor().run(job))
    mock_proc().start.assert_called_once()
    mock_proc().join.assert_called_once()
    mock_proc().terminate.assert_called_once()
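
The patched names hint at how Processor works internally: run() appears to execute the crawl in a separate Process, collect the items back through a Queue, then join and terminate the child process, which is why each of those calls is asserted exactly once.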
Example 11
def scrap_btls_scoreboard():
    settings = Settings()
    settings.set("USER_AGENT", "Jesse McClure (+http://jrmcclure.github.io)")
    settings.set("BOT_NAME", "btls_test")
    settings.set("ROBOTSTXT_OBEY", False)
    settings.set("ITEM_PIPELINES", {
        'btls_test.btls_test.pipelines.BtlsTestPipeline': 1000,
    })
    job = Job(TestSpider())
    Processor(settings).run(job)
    return 'Return from scrapy {}'.format(10)
Example 12
def test_multiple_jobs_return_job_specific_data_in_each_result():
    jobs = [
        Job(TitleSpider, url="http://www.python.org"),
        Job(TitleSpider, url="http://www.github.com"),
    ]

    results = Processor().run(jobs)
    data = [item["data"].lower() for item in results]
    assert any("python" in s for s in data)
    assert any("github" in s for s in data)
    assert len(results) == 2
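
TitleSpider is referenced by several tests here but never defined. A sketch that would satisfy the assertions above, yielding the page title under a "data" key, could look like this; it is an illustration, not the project's actual spider.

from scrapy import Spider, Request


class TitleSpider(Spider):
    # Hypothetical sketch: returns the page <title> under the "data" key,
    # the field the test above searches for "python" / "github".
    name = "titlespider"

    def start_requests(self):
        yield Request(self.url)

    def parse(self, response):
        return {"data": response.xpath("//title/text()").get()}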
Example 13
def test_crawl_calls_crawlerprocess_with_correct_params(mocker):
    mock_crawl = mocker.patch("scrapyscript.CrawlerProcess")
    mock_crawl().crawl.return_value = None
    mock_crawl().start.return_value = None
    mock_crawl().stop.return_value = None

    url = "http://www.python.org"
    job = Job(TitleSpider, url=url)
    Processor()._crawl([job])

    mock_crawl().crawl.assert_called_with(job.spider, url=url)
    mock_crawl().start.assert_called_once()
    mock_crawl().stop.assert_called_once()
Example 14
from scrapy import Spider, Request
from scrapy.settings import Settings
from scrapyscript import Job, Processor
import json


def spider_results():

    # Define a Scrapy Spider, which can accept *args or **kwargs
    # https://doc.scrapy.org/en/latest/topics/spiders.html#spider-arguments
    class PythonSpider(Spider):
        name = 'myspider'

        def start_requests(self):
            yield Request(self.url)

        def parse(self, response):
            #title = response.xpath('//title/text()').extract()
            precio_meta = response.xpath(
                '//*[@id="root-app"]/div/div[3]/div/div[2]/div[1]/div/div[3]/div/div[1]/div/span/span[2]/text()'
            ).extract()
            return {'url': response.request.url, 'precio': precio_meta}

    # Create jobs for each instance. *args and **kwargs supplied here will
    # be passed to the spider constructor at runtime
    githubJob = Job(
        PythonSpider,
        url='https://articulo.mercadolibre.com.ar/MLA-850664638-cuadernos-anotador-2020-modelos-de-diseno-_JM#position=1&type=item&tracking_id=cb49fd5e-5e5d-4e33-903b-66f14e0f3ac5'
    )
    # pythonJob = Job(PythonSpider, url='http://www.python.org')

    # Create a Processor, optionally passing in a Scrapy Settings object.
    cust_settings = Settings()
    cust_settings['USER_AGENT'] = "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/34.0.1847.131 Safari/537.36"
    processor = Processor(settings=cust_settings)

    # Start the reactor, and block until all spiders complete.
    data = processor.run([githubJob])

    # Print the consolidated results
    # print(json.dumps(data, indent=4))
    return json.dumps(data, indent=4)
Example 15
def run_crawler(url, numofpages):
    newsJob = Job(CrawlerSpider, url=url, numofpages=numofpages)
    processor = Processor(get_project_settings())
    processor.run([newsJob])
    return None
Example 16
def test_for_deadlock():
    jobs = [Job(TitleSpider, url="http://www.python.org") for i in range(50)]

    results = Processor().run(jobs)
    assert len(results) == 50
Example 17
def test_item_scraped_appends_items():
    p = Processor()
    p._item_scraped("test")
    assert p.items[0] == "test"
Example 18
def test_big_return_value():
    job = Job(BigSpider, url="http://www.python.org")
    results = Processor().run(job)
    assert len(results) == 1
Example 19
def test_bad_return_value():
    job = Job(BadSpider)
    results = Processor().run(job)
    assert results == []
Example 20
def check_registration(voter_data=None):
    job = Job(VoterRegistrationSpider(), payload=voter_data)
    response = Processor(get_project_settings()).run(job)

    return response
Example 21
def test_args_kwargs_passed_to_spider():
    spider = ParamReturnSpider
    job = Job(spider, "cat1", fruit="banana")
    result = Processor().run(job)
    assert result == [dict(category="cat1", fruit="banana")]
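
ParamReturnSpider is likewise left undefined. Given the assertion, it has to accept one positional argument plus a fruit keyword and echo both back as a single item; one possible sketch follows (the request URL is a placeholder assumption).

from scrapy import Spider, Request


class ParamReturnSpider(Spider):
    # Hypothetical sketch: stores the arguments passed through Job() and
    # returns them as the single item the assertion above expects.
    name = "paramreturn"

    def __init__(self, category, fruit=None, *args, **kwargs):
        super().__init__(*args, **kwargs)
        self.category = category
        self.fruit = fruit

    def start_requests(self):
        # Placeholder URL; any reachable page works since parse() ignores it.
        yield Request("http://www.python.org")

    def parse(self, response):
        return {"category": self.category, "fruit": self.fruit}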
Example 22
def test_job_validate_raises_exception_if_not_jobs():
    with pytest.raises(ScrapyScriptException):
        jobs = [Job(TitleSpider, url="http://www.python.org"), "not a Job"]
        p = Processor()
        p.validate(jobs)
Example 23
    def test_job_from_xpath(self):
        job = Job.from_xpath('http://www.python.org', '//title/text()')
        results = Processor().run(job)
        self.assertEqual(results[0]['data'][0].extract(),
                         'Welcome to Python.org')
Example 24
def celery_job_with_custom_settings(url, settings):
    job = Job(ItemSpider, url=url)
    return Processor(settings=settings).run(job)
Example 25
def celery_job(url):
    job = Job(TitleSpider, url=url)
    return Processor().run(job)
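
Both celery_job functions read like the bodies of Celery tasks; scrapyscript runs each crawl in its own process, which is what makes it usable from a long-lived worker. A minimal task wrapper is sketched below, assuming a Celery app, broker URL, and spider import path that are not part of the original snippets.

from celery import Celery
from scrapyscript import Job, Processor

from myproject.spiders import TitleSpider  # hypothetical import path

# Hypothetical app and broker URL, for illustration only.
app = Celery("tasks", broker="redis://localhost:6379/0")


@app.task
def crawl_title(url):
    # Blocks inside the worker process until the crawl finishes, then
    # returns the scraped items as the task result.
    job = Job(TitleSpider, url=url)
    return Processor().run(job)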
Example 26
    def test_payload_flows_through_to_spider(self):
        job = Job(TestSpider(), payload='apples')
        results = Processor().run(job)

        self.assertIn({'payload': 'apples'}, results)
Example 27
    def test_parameters_passed_to_spider(self):
        spider = ParamReturnSpider
        job = Job(spider, 'cat1', fruit='banana')
        result = Processor().run(job)
        self.assertEqual(result, [dict(category='cat1', fruit='banana')])
Example 28
    def test_bad_return_value(self):
        job = Job(BadSpider, url='http://www.python.org')
        results = Processor().run(job)
        self.assertEqual(results, [])
Example 29
def work(url, g_id):
    broker_job = Job(BrokenImageChecker, url=url, g_id=g_id)
    processor = Processor(settings=None)
    result = processor.run([broker_job])
    return result
Example 30
from scrapyscript import Job, Processor
from scrapy.utils.project import get_project_settings

from web_site_info.spiders.site_info import SiteInfoSpider

if __name__ == "__main__":

    start_urls = open(0).read().splitlines()
    jobs = list()

    for url in start_urls:
        job = Job(SiteInfoSpider, url=url)
        jobs.append(job)

    processor = Processor(get_project_settings())
    data = processor.run(jobs)

    for item in data:
        print(item['_values'])
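
Because this script reads its start URLs from standard input via open(0), it expects a newline-separated list of URLs on stdin; one Job is created per URL, and a single Processor.run() call then collects the items from all of them.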