def scrape(self, ind):
    db = FirebaseAccess()
    urls1 = [
        'https://www.class-central.com/subject/cs',
        'https://www.class-central.com/subject/business',
        'https://www.class-central.com/subject/science',
        'https://www.class-central.com/subject/data-science',
        'https://www.class-central.com/subject/programming-and-software-development',
        'https://www.class-central.com/subject/engineering',
        'https://www.class-central.com/subject/maths'
    ]
    urls2 = [
        'https://www.class-central.com/subject/humanities',
        'https://www.class-central.com/subject/social-sciences',
        'https://www.class-central.com/subject/education',
        'https://www.class-central.com/subject/personal-development',
        'https://www.class-central.com/subject/art-and-design',
        'https://www.class-central.com/subject/health'
    ]
    if ind == 1:
        sched = Job(CoursesSpider, fbadb=db, urls_to_scrape=urls1)
    else:
        sched = Job(CoursesSpider, fbadb=db, urls_to_scrape=urls2)
    processor = Processor(settings=None)
    data = processor.run([sched])
from urllib.parse import quote

def get_product_info(term):
    candleJob = Job(CandleSpider, url="https://www.yankeecandle.com/search?Ntt=" + quote(term))
    processor = Processor(settings=None)
    results = processor.run([candleJob])
    if len(results) == 0:
        return None
    return results[0]
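# A usage sketch (the search term here is hypothetical). quote() percent-encodes
# the term, so "autumn leaves" becomes .../search?Ntt=autumn%20leaves:
info = get_product_info("autumn leaves")
if info is None:
    print("no matching product found")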
def scrape(self, query):
    # Create jobs for each instance. *args and **kwargs supplied here will
    # be passed to the spider constructor at runtime
    myJob = Job(ClassCentralSpider, keys=query)

    # Create a Processor, optionally passing in a Scrapy Settings object.
    processor = Processor(settings=None)

    # Start the reactor, and block until all spiders complete.
    data = processor.run([myJob])
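# For context, a minimal sketch of the receiving side (not the project's actual
# spider, whose implementation is not shown here): Scrapy's Spider.__init__ sets
# any kwargs passed through Job as attributes, so `keys=query` above arrives as
# `self.keys`. The spider name and search URL below are assumptions.
import scrapy

class ClassCentralSpiderSketch(scrapy.Spider):
    name = "class_central_sketch"

    def start_requests(self):
        # self.keys was supplied via Job(..., keys=query)
        yield scrapy.Request(f"https://www.class-central.com/search?q={self.keys}")

    def parse(self, response):
        yield {"title": response.css("title::text").get()}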
def crawl_lad_scrapyscript(depth=lad_depth, urls=None, domain=lad_domain):
    """Version of crawl_lad that allows repeated runs on one worker
    without a restart.
    """
    settings = scrapy_settings(depth, concurrent_requests)
    if urls is None:
        urls = list(get_gov_websites(gov_sites_path))
    job = Job(LadSpider, urls, domain, depth)
    processor = Processor(settings=settings)
    data = processor.run([job])
    print(json.dumps(data, indent=4))
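# Why this can run repeatedly on one worker: scrapyscript hands each crawl to a
# separate child process, so the Twisted reactor never has to restart inside this
# process, whereas a bare CrawlerProcess raises ReactorNotRestartable on a second
# run. A minimal sketch with a hypothetical DemoSpider; note the fresh Processor
# per call:
import scrapy
from scrapyscript import Job, Processor

class DemoSpider(scrapy.Spider):
    name = "demo"
    start_urls = ["http://www.python.org"]

    def parse(self, response):
        yield {"title": response.css("title::text").get()}

for _ in (1, 2):
    # Each run happens in its own subprocess, so back-to-back runs succeed.
    print(Processor(settings=None).run([Job(DemoSpider)]))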
def spider_results():
    # Define a Scrapy Spider, which can accept *args or **kwargs
    # https://doc.scrapy.org/en/latest/topics/spiders.html#spider-arguments
    class PythonSpider(Spider):
        name = 'myspider'

        def start_requests(self):
            yield Request(self.url)

        def parse(self, response):
            # title = response.xpath('//title/text()').extract()
            precio_meta = response.xpath(
                '//*[@id="root-app"]/div/div[3]/div/div[2]/div[1]/div/div[3]/div/div[1]/div/span/span[2]/text()'
            ).extract()
            return {'url': response.request.url, 'precio': precio_meta}

    # Create jobs for each instance. *args and **kwargs supplied here will
    # be passed to the spider constructor at runtime
    githubJob = Job(
        PythonSpider,
        url='https://articulo.mercadolibre.com.ar/MLA-850664638-cuadernos-anotador-2020-modelos-de-diseno-_JM#position=1&type=item&tracking_id=cb49fd5e-5e5d-4e33-903b-66f14e0f3ac5'
    )
    # pythonJob = Job(PythonSpider, url='http://www.python.org')

    # Create a Processor, optionally passing in a Scrapy Settings object.
    cust_settings = Settings()
    cust_settings['USER_AGENT'] = (
        "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 "
        "(KHTML, like Gecko) Chrome/34.0.1847.131 Safari/537.36"
    )
    processor = Processor(settings=cust_settings)

    # Start the reactor, and block until all spiders complete.
    data = processor.run([githubJob])

    # Return the consolidated results as JSON.
    # print(json.dumps(data, indent=4))
    return json.dumps(data, indent=4)
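# A more compact way to build the same settings object: Settings accepts an
# initial `values` dict (the last example below uses the same form).
from scrapy.settings import Settings

compact_settings = Settings(values={
    "USER_AGENT": "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 "
                  "(KHTML, like Gecko) Chrome/34.0.1847.131 Safari/537.36",
})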
def work(url, g_id):
    broker_job = Job(BrokenImageChecker, url=url, g_id=g_id)
    processor = Processor(settings=None)
    result = processor.run([broker_job])
    return result
from scrapyscript import Job, Processor
from scrapy.utils.project import get_project_settings

from web_site_info.spiders.site_info import SiteInfoSpider

if __name__ == "__main__":
    # Read start URLs from stdin, one per line (e.g. `python run.py < urls.txt`).
    start_urls = open(0).read().splitlines()
    jobs = list()
    for url in start_urls:
        job = Job(SiteInfoSpider, url=url)
        jobs.append(job)
    processor = Processor(get_project_settings())
    data = processor.run(jobs)
    for item in data:
        # `_values` is a private attribute of scrapy Items; if the results are
        # Item objects, dict(item) is the public equivalent.
        print(item['_values'])
def run_crawler(url, numofpages):
    newsJob = Job(CrawlerSpider, url=url, numofpages=numofpages)
    processor = Processor(get_project_settings())
    processor.run([newsJob])
    return None
import scrapy
from scrapyscript import Job, Processor

settings = scrapy.settings.Settings(values={"LOG_LEVEL": "WARNING"})
processor = Processor(settings=settings)

class PythonSpider(scrapy.spiders.Spider):
    name = "myspider"

    def start_requests(self):
        yield scrapy.Request(self.url)

    def parse(self, response):
        data = response.xpath("//title/text()").extract_first()
        return {"title": data}

job = Job(PythonSpider, url="http://www.python.org")
results = processor.run(job)
print(results)
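# Processor.run also accepts a list of Jobs and returns one consolidated list
# of items; a sketch reusing the spider above (the second URL is arbitrary):
jobs = [
    Job(PythonSpider, url="http://www.python.org"),
    Job(PythonSpider, url="https://scrapy.org"),
]
print(Processor(settings=settings).run(jobs))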