def get_scraper(url: str, scrapers_file_name: str = 'scrapers.json') -> Union[None, Scraper]:
    """Return the stored Scraper registered for *url*'s domain.

    The registry file maps domain names to scraper file names. When the
    domain of *url* has no entry, ``None`` is returned; otherwise the
    referenced scraper file is opened and deserialized.

    :param url: page URL whose domain selects the scraper
    :param scrapers_file_name: name of the JSON registry file
    :return: a loaded ``Scraper``, or ``None`` if the domain is unknown
    """
    domain = get_domain(url)
    with open(get_file_path(scrapers_file_name), 'r') as scrapers_file:
        registry = json.load(scrapers_file)
    # Guard clause: unknown domain -> no scraper available.
    if domain not in registry:
        return None
    scraper_file_name = registry[domain]
    with open(get_file_path(scraper_file_name), 'r') as scraper_file:
        return Scraper.fromfile(scraper_file)
def test_train_store_load_scrape(self):
    """Train a scraper, round-trip it through serialization, then scrape.

    Verifies that a scraper stored to an in-memory file and reloaded
    still extracts the expected fields from a second page of the site.
    """
    train_url = "http://www.icone.co.uk/lighting-suspension/copper-shade-by-tom-dixon/tom-dixon/tom-dixon/MSS45UKC/"
    training_data = {
        "name": "Copper Shade by Tom Dixon",
        "designer": "Tom Dixon",
        "price": "320",
    }
    scraper = Scraper()
    scraper.train(train_url, training_data, encoding="latin1")

    # Serialize into an in-memory buffer and load it back, so the rest
    # of the test exercises the deserialized scraper.
    buffer = StringIO()
    scraper.tofile(buffer)
    buffer.seek(0)
    scraper = Scraper.fromfile(buffer)

    target_url = "http://www.icone.co.uk/lighting-wall-and-ceiling/mesmeri-halo-chrome/artemide/eric-sole/0916024A/"
    scraped = scraper.scrape(target_url, encoding="latin1")
    self.assertEqual(sorted(scraped[0]), ["designer", "name", "price"])
def test_extraction(self):
    """Train from an HtmlPage and check extraction on a second page,
    both before and after a serialize/deserialize round trip.
    """
    samples_encoding = 'latin1'
    samples = list(iter_samples(
        'scraper_loadstore', html_encoding=samples_encoding))
    (html1, data1), (html2, data2) = samples

    scraper = Scraper()
    training_page = HtmlPage(body=html1, encoding=samples_encoding)
    scraper.train_from_htmlpage(training_page, data1)

    target_page = HtmlPage(body=html2, encoding=samples_encoding)
    self._assert_extracted(scraper.scrape_page(target_page), data2)

    # Round-trip through an in-memory file: extraction must still work
    # with the restored scraper.
    buffer = StringIO()
    scraper.tofile(buffer)
    buffer.seek(0)
    restored = Scraper.fromfile(buffer)
    self._assert_extracted(restored.scrape_page(target_page), data2)