def test_train_store_load_scrape(self):
    # End-to-end round trip: train on one live product page, serialize the
    # scraper through a file-like object, deserialize it, and scrape a
    # second page of the same site. Note this test fetches both URLs over
    # the network.
    url1 = "http://www.icone.co.uk/lighting-suspension/copper-shade-by-tom-dixon/tom-dixon/tom-dixon/MSS45UKC/"
    data = {"name": "Copper Shade by Tom Dixon",
            "designer": "Tom Dixon",
            "price": "320"}
    s = Scraper()
    s.train(url1, data, encoding="latin1")

    # Store and reload the trained scraper before scraping.
    f = StringIO()
    s.tofile(f)
    f.seek(0)
    s = Scraper.fromfile(f)

    url2 = "http://www.icone.co.uk/lighting-wall-and-ceiling/mesmeri-halo-chrome/artemide/eric-sole/0916024A/"
    data = s.scrape(url2, encoding="latin1")
    self.assertEqual(sorted(data[0].keys()), ["designer", "name", "price"])
def test_extraction(self):
    samples_encoding = 'latin1'
    [(html1, data1), (html2, data2)] = list(iter_samples(
        'scraper_loadstore', html_encoding=samples_encoding))
    sc = Scraper()
    page1 = HtmlPage(body=html1, encoding=samples_encoding)
    sc.train_from_htmlpage(page1, data1)

    page2 = HtmlPage(body=html2, encoding=samples_encoding)
    extracted_data = sc.scrape_page(page2)
    self._assert_extracted(extracted_data, data2)

    # check still works after serialize/deserialize
    f = StringIO()
    sc.tofile(f)
    f.seek(0)
    sc = Scraper.fromfile(f)
    extracted_data = sc.scrape_page(page2)
    self._assert_extracted(extracted_data, data2)
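# The test above relies on two helpers that are not shown in this section:
# iter_samples() and self._assert_extracted(). A minimal sketch of what they
# might look like, assuming samples live beside the tests as paired
# <name>_<i>.html / <name>_<i>.json files; the file layout, names, and
# comparison logic here are assumptions, not the suite's actual helpers.

import json
import os
from glob import glob

SAMPLES_DIR = os.path.join(os.path.dirname(__file__), "samples")  # assumed layout


def iter_samples(name, html_encoding="utf-8"):
    # Yield (html, data) pairs: the decoded page body and its annotations.
    for html_path in sorted(glob(os.path.join(SAMPLES_DIR, name + "_*.html"))):
        with open(html_path, "rb") as f:
            html = f.read().decode(html_encoding)
        with open(html_path[:-len(".html")] + ".json") as f:
            data = json.load(f)
        yield html, data


def _assert_extracted(self, extracted_data, expected):
    # Defined on the test class in practice: check that every annotated
    # field shows up in the first extracted record.
    record = extracted_data[0]
    for field in expected:
        self.assertIn(field, record)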
import json
import re
import sys

from scrapely import Scraper


def update_scrapers_file(url):
    # Derive the scraper file name from the URL's domain and record the
    # domain -> file mapping in scrapers.json.
    domain = re.search(r'(?<=//)[\w.-]+(?=/)', url).group()
    scraper_file_name = domain + ".json"
    with open('scrapers.json', 'r') as scrapers_file:
        scrapers_json = json.load(scrapers_file)
    scrapers_json[domain] = scraper_file_name
    with open('scrapers.json', 'w') as scrapers_file:
        json.dump(scrapers_json, scrapers_file)
    return scraper_file_name


# TODO add help and verbose modes
# TODO add arg validation and error feedback
scraper = Scraper()
training_params = open_training_file()
assert training_params, "no training parameters found in {}".format(sys.argv[1])

url = training_params['url']
params = training_params['params']
scraper.train(url, params)

# TODO replace this with a database action, and maybe compare checksums to
# avoid writing the same scraper more than once?
scraper_file_name = update_scrapers_file(url)
with open(scraper_file_name, 'w') as scraper_file:
    scraper.tofile(scraper_file)
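# open_training_file() is not defined in this section. A plausible sketch,
# assuming the training parameters are a JSON document whose path is the
# script's first command-line argument, with top-level "url" and "params"
# keys matching what the script reads above; the format and error handling
# here are assumptions.


def open_training_file():
    # Return the parsed training parameters, or None if the file is missing
    # or not valid JSON (the script's assert then fires with a message).
    try:
        with open(sys.argv[1]) as f:
            return json.load(f)
    except (IOError, ValueError):
        return None


# An input file matching those keys might look like (values are illustrative):
#
# {
#     "url": "http://www.example.com/products/some-item/",
#     "params": {"name": "Some Item", "price": "19.99"}
# }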