def auto_generate(sampleurl, data, common_url):
    """Train a scrapely Scraper on one annotated page, then scrape another.

    Args:
        sampleurl: URL of the example page used for training.
        data: mapping of field name -> example value visible on sampleurl.
        common_url: structurally similar URL to extract from.

    Returns:
        The first scrape result, with each field flattened from a list to a
        single string with newlines removed and surrounding whitespace trimmed.
    """
    scraper = Scraper()
    scraper.train(sampleurl, data)
    first_match = scraper.scrape(common_url)[0]
    return {field: values[0].replace('\n', '').strip()
            for field, values in first_match.items()}
def test_train_store_load_scrape(self):
    """Round-trip a trained template through an in-memory file, then scrape.

    Trains on one product page, serialises the scraper, restores it from the
    same buffer, and checks the restored scraper extracts the expected fields
    from a second page.
    """
    train_url = "http://www.icone.co.uk/lighting-suspension/copper-shade-by-tom-dixon/tom-dixon/tom-dixon/MSS45UKC/"
    sample = {"name": "Copper Shade by Tom Dixon", "designer": "Tom Dixon", "price": "320"}
    scraper = Scraper()
    scraper.train(train_url, sample, encoding="latin1")

    # Serialise the learned template and load it back from the buffer.
    buf = StringIO()
    scraper.tofile(buf)
    buf.seek(0)
    scraper = Scraper.fromfile(buf)

    target_url = "http://www.icone.co.uk/lighting-wall-and-ceiling/mesmeri-halo-chrome/artemide/eric-sole/0916024A/"
    results = scraper.scrape(target_url, encoding="latin1")
    self.assertEqual(sorted(results[0].keys()), ["designer", "name", "price"])
from scrapely import Scraper
import sys
import json

# Usage: python scrape.py <site-name>
# Reads <site>_train.json / <site>_tests.json and writes <site>_result.json.
try:
    scrape_site = sys.argv[1]
except IndexError:  # was a bare except: only a missing argv entry can occur here
    # Fixed typo in the usage message ("arguements" -> "arguments").
    print('Invalid arguments. Usage python scrape.py <site-name>')
    sys.exit(2)

print('Training the scraper with existing data-set')
s = Scraper()
result = {}

# Train on every (url, title) pair from the training set.
with open(scrape_site + '_train.json', 'r') as f:
    train_data = json.loads(f.read())
for data in train_data:
    s.train(data['url'], {'name': data['title']})

# Scraper.scrape returns a *list* of result dicts; the original passed the
# whole list to dict.update, which raises. Merge the best (first) match.
with open(scrape_site + '_tests.json', 'r') as f:
    test_data = json.loads(f.read())
for data in test_data:
    result.update(s.scrape(data['url'])[0])

with open(scrape_site + '_result.json', 'w') as f:
    f.write(json.dumps(result))
""" @author: 挖掘机小王子 @contact: [email protected] @software: PyCharm @file: test.py @time: 2019/12/6 11:53 @desc: """ from scrapely import Scraper import requests scraper = Scraper() url = 'https://www.ituring.com.cn/article' data = {'name': 'duxingzhe', 'title': '这两天的面试经验总结'} # response = requests.get(url).text scraper.train(url, data) result = scraper.scrape(url, encoding='utf-8') print(result)
if obj <> "k.A": graph.add((busRes, BUS[key], Literal(obj))) return graph.serialize(format='turtle') busScraper = Scraper() busScraper.train(busUrlFormat % '1120301', exampleData) offset = 0 while True: html = scraperwiki.scrape(catalogUrlFormat % offset) root = lxml.html.fromstring(html) busIds = root.cssselect('input[type=checkbox]') if len(busIds) > 0: for busCheckbox in busIds: busUrl = busUrlFormat % busCheckbox.attrib['value'] busGraph = CreateGraph(busCheckbox.attrib['value'], busScraper.scrape(busUrl)) dataStored = {'url': busUrl, 'graph': busGraph} scraperwiki.sqlite.save(unique_keys=['url'], data=dataStored) offset += 20 else: breakimport scraperwiki import lxml.html from scrapely import Scraper from rdflib import RDF from rdflib.graph import Graph from rdflib import Literal, BNode, Namespace catalogUrlFormat ='http://www.omnibusrevue.de/buskatalog-578829.html?skip=%d' busUrlFormat = 'http://www.omnibusrevue.de/bus-%s.html' busUrlFormatWithName = 'http://www.omnibusrevue.de/{0}-{1}.html'
from scrapely import Scraper

# Experiment 1: train on three Douban movie pages, then scrape a fourth.
s = Scraper()

url1 = 'http://movie.douban.com/subject/1292063/'
data1 = {'name': u'美丽人生 La vita è bella', 'author': u'罗伯托·贝尼尼', 'time': '1997-12-20'}
s.train(url1, data1)

url2 = 'http://movie.douban.com/subject/1291560/'
# s.scrape(url2)
data2 = {'name': u'龙猫 となりのトトロ', 'author': u'宫崎骏', 'time': '1988-04-16'}
s.train(url2, data2)

url3 = 'http://movie.douban.com/subject/1293839/'
data3 = {'name': u'罗马假日 Roman Holiday', 'author': u'威廉·惠勒', 'time': '1953-08-27'}
# s.scrape(url3)
s.train(url3, data3)

url4 = 'http://movie.douban.com/subject/1292224/'
s.scrape(url4)

# Experiment 2: a fresh scraper trained on a single example page, applied
# to the same target URL (kept from the original script).
from scrapely import Scraper

s = Scraper()
url1 = 'http://movie.douban.com/subject/1292063/'
data1 = {'name': u'美丽人生 La vita è bella', 'author': u'罗伯托·贝尼尼', 'time': '1997-12-20'}
s.train(url1, data1)
url4 = 'http://movie.douban.com/subject/1292224/'
s.scrape(url4)

# with open('11.txt','wb') as afile:
#     s.tofile(afile)
def create_data(self):
    """Scrape current prices, diff them against the previous run, persist both.

    Trains a scrapely scraper on one known product page, scrapes every URL in
    pages_to_check.json, writes a timestamped price list plus a comparison
    against /tmp/prices-recent.json, and updates the "recent" files.

    Returns:
        Path of the newly written comparison file when any price changed or a
        product appeared/disappeared, otherwise None.
    """
    training_url = "http://www.wholesalegaming.biz/startrek/trekalphastarterbox/"
    data_training = {"product": "Star Trek Alpha Unlimited Starter Box", "price": "$15.00"}

    # Train scrapely on the single annotated example page.
    scraper = Scraper()
    scraper.train(training_url, data_training)

    # Get the URLs to check.
    # format (all strings in unicode): {"urls": [<url1>, <url2>, ..., <urln>]}
    page_json = file("pages_to_check.json").read()
    urls_to_check = json.loads(page_json)

    # Dictionary mapping "product name" -> [price, url].
    price_list = {}
    for each_url in urls_to_check["urls"]:
        # Example scrape result:
        # [{u'price': [u' $15.00 '], u'product': [u'Star Trek Alpha Unlimited Starter Box']}]
        scraped_data = scraper.scrape(each_url)

        # Sanitize the price to a float.
        dollar_string = scraped_data[0]["price"][0].replace(" ", "")
        removed_dollar_sign = dollar_string.replace("$", "")
        try:
            price_as_float = float(removed_dollar_sign)
        except ValueError:
            # If the value isn't convertible to a float it most likely reads
            # "Product Unavailable"; N/A is our tell for that further down.
            price_as_float = "N/A"

        product_name = scraped_data[0]["product"][0]
        price_list[product_name] = [price_as_float, each_url]

    # Snapshot this run's prices to a timestamped file.
    timestamp = strftime("%Y-%m-%d-%H:%M:%S", gmtime())
    with open("/tmp/prices-%s.json" % timestamp, "w") as fp:
        json.dump(price_list, fp, sort_keys=True, indent=4)

    # Load the most recent price list to diff against.
    recent_price_list = {}
    with open('/tmp/prices-recent.json', 'r') as fp:
        recent_price_list = json.load(fp)

    # Output of comparing old and new data. Format:
    # { "product_name": {
    #       "old_price": <float>,
    #       "new_price": <float>,
    #       "new_difference": <float of new price - old price>,
    #       "is_difference": <boolean>,
    #       "is_new_product": <boolean>,
    #       "is_discontinued_product": <boolean>,
    #       "product_url": <string> }, ... }
    comparison_data = {}
    for old_product, old_price in recent_price_list.iteritems():
        new_difference = 0.0
        is_difference = False
        is_discontinued_product = False
        try:
            new_price = price_list[old_product]
            new_difference = new_price[0] - old_price[0]
        except (KeyError, TypeError):
            # KeyError: old_product no longer appears in price_list.
            # TypeError: the old price was "N/A" rather than a float.
            new_price = [0.0]
            is_discontinued_product = True
        if new_difference != 0.0:
            is_difference = True
        comparison_data[old_product] = {
            "old_price": old_price[0],
            "new_price": new_price[0],
            "new_difference": new_difference,
            "is_difference": is_difference,
            "is_new_product": False,
            "is_discontinued_product": is_discontinued_product,
            "product_url": old_price[1],
        }

    # Anything in price_list but not in recent_price_list is new inventory.
    new_inventory_list = list(set(price_list.keys()) - set(recent_price_list.keys()))
    for each_product in new_inventory_list:
        comparison_data[each_product] = {
            "old_price": 0.0,
            "new_price": price_list[each_product][0],
            "new_difference": price_list[each_product][0],
            "is_difference": True,
            "is_new_product": True,
            "is_discontinued_product": False,
            "product_url": price_list[each_product][1],
        }

    # Makes it easy to find the always most recent data.
    with open("/tmp/price-comparison-recent.json", "w") as fp:
        json.dump(comparison_data, fp, sort_keys=True, indent=4)

    # Update the recent prices.
    with open("/tmp/prices-recent.json", "w") as fp:
        json.dump(price_list, fp, sort_keys=True, indent=4)

    # Write a timestamped comparison file only when something actually changed.
    # BUGFIX 1: the original tested `"True" in comparison_data`, i.e. looked
    # for a product literally named "True" — check the is_difference flags.
    # BUGFIX 2: the original never applied `% timestamp`, so the file was
    # literally named "/tmp/price-comparison-%s.json".
    timestamp = strftime("%Y-%m-%d-%H:%M:%S", gmtime())
    if any(entry["is_difference"] for entry in comparison_data.itervalues()):
        filename = "/tmp/price-comparison-%s.json" % timestamp
        with open(filename, "w") as fp:
            json.dump(comparison_data, fp, sort_keys=True, indent=4)
        return filename
    return None
#!/usr/bin/env python
# -*- coding: utf-8 -*-
# @Time : 2020/3/3 0:27
# @Author : tanxw
# pip install scrapely

from scrapely import Scraper

# One annotated example page is enough for scrapely to induce a template.
s = Scraper()
train_url = 'http://example.webscraping.com/places/default/view/Afghanistan-1'
s.train(train_url, {'name': 'Afghanistan', 'population': '29,121,286'})

# Apply the learned template to a structurally identical country page.
test_url = 'http://example.webscraping.com/places/default/view/United-Kingdom-239'
s.scrape(test_url)
from scrapely import Scraper

# Skeleton: fill in an example URL and the field values visible on that page.
s = Scraper()
url = ""
data = {}
# BUGFIX: Scraper.scrape() does not accept example data — its second
# parameter is `encoding`. The annotated example must be given to train();
# scrape() then takes only the target URL.
s.train(url, data)
s.scrape(url)
from scrapely import Scraper
import sys
import json

# Usage: python scrape.py <site-name>
# Trains from <site>_train.json, scrapes <site>_tests.json, writes
# <site>_result.json.
try:
    scrape_site = sys.argv[1]
except IndexError:  # narrowed from a bare except: only argv can fail here
    # Fixed typo in the usage message ("arguements" -> "arguments").
    print('Invalid arguments. Usage python scrape.py <site-name>')
    sys.exit(2)

print('Training the scraper with existing data-set')
s = Scraper()
result = {}

with open(scrape_site + '_train.json', 'r') as handle:
    train_data = json.loads(handle.read())
for data in train_data:
    s.train(data['url'], {'name': data['title']})

with open(scrape_site + '_tests.json', 'r') as handle:
    test_data = json.loads(handle.read())
for data in test_data:
    # BUGFIX: scrape() returns a list of result dicts; updating the result
    # dict with that list raises. Merge the first (best) match instead.
    result.update(s.scrape(data['url'])[0])

with open(scrape_site + '_result.json', 'w') as handle:
    handle.write(json.dumps(result))
def scrapely_test():
    """Train on one country page and print the fields scraped from another."""
    scraper = Scraper()
    sample_url = 'http://example.python-scraping.com/view/Afghanistan-1'
    scraper.train(sample_url, {'name': 'Afghanistan', 'population': '29,121,286'})
    target_url = 'http://example.python-scraping.com/view/United-Kingdom-239'
    print(scraper.scrape(target_url))
# NOTE(review): fragment — the opening of the training-data dict (and the
# definitions of `s`, `url1`, `urls`, `h`) lies before this view; this is the
# dict's tail plus a CSV-export loop. Python 2 (`print` statements). The bare
# `except:` swallows every scrape/encode failure as ":(". Left byte-identical;
# reformat once the missing head is in view.
'text': '<div id="article">', 'author': 'Redazione Journey', 'date': '22 mar 2017' } s.train(url1, data) # file opener file_wht = open('test.csv', "wb") writer = csv.writer(file_wht, delimiter=';', quotechar='"', quoting=csv.QUOTE_ALL) writer.writerow(("Titolo", "Testo", "Autore", "Data")) # get stuff for item in urls: try: content = s.scrape(item)[0] title = h.handle(content["title"][0]).encode('utf-8') parsed_text = h.handle(content["text"][0]).encode('utf-8') author = h.handle(content["author"][0]).encode('utf-8') date = h.handle(content["date"][0]).encode('utf-8') print "Success!" tpl = (title, parsed_text, author, date) writer.writerow(tpl) except: print ":(" file_wht.close()
# Fragment continues an earlier script: `s`, `url1`, `data` and `pickle`
# are defined before this view. Debug lines kept commented for reference.
##pp = pprint.Prettyprint(indent=2)
#pprint.pprint(d)
#print d[0]['title'][0]
#print d[0]['category'][0]
#print d[0]['date'][0]
s.train(url1, data)

import os

# Scrape every file in ../utf8/ with the trained template and collect
# the extracted fields per filename.
data = {}
#for dirname, dirnames, filenames in os.walk('../utf8/'):
#    for filename in filenames:
for fn in os.listdir('../utf8/'):
    print(fn)
    url2 = '../utf8/' + fn
    d = s.scrape(url2)
    try:
        data[fn] = {
            'title': unicode(d[0]['title'][0]),
            'category': unicode(d[0]['category'][0]),
            'date': unicode(d[0]['date'][0]),
        }
    except Exception as e:
        print(e)

open('data.pickle', 'w').write(pickle.dumps(data))
import urllib, scraperwiki
from scrapely import Scraper

s = Scraper()
# note how we're *not* using Scraper() - this uses our custom version
url1 = 'http://www.thefest.com/store/beatles-ornaments/the-beatles-applique-stocking-p-3901'
data = {'name': 'THE BEATLES APPLIQUE STOCKING',
        'category': 'Beatles Ornaments',
        'description': 'BRAND NEW- If you are good, maybe Santa will put something special in this poly/cotton applique stocking - He will have to work overtime to fill this! Measures 19" diagonally from upper left facing to the tip of the toe. This is the first Christmas Beatles Stocking ever offered!',
        'price': '$20.00',
        'catalog number': '7287'}
s.train(url1, data)
url2 = 'http://www.thefest.com/store/beatles-ornaments/yellow-submarines-light-set-p-3876'
print(s.scrape(url2))

# The original pasted the identical snippet twice (the trailing print and the
# next import were fused); the second copy is preserved below.
import urllib, scraperwiki
from scrapely import Scraper

s = Scraper()
# note how we're *not* using Scraper() - this uses our custom version
url1 = 'http://www.thefest.com/store/beatles-ornaments/the-beatles-applique-stocking-p-3901'
data = {'name': 'THE BEATLES APPLIQUE STOCKING',
        'category': 'Beatles Ornaments',
        'description': 'BRAND NEW- If you are good, maybe Santa will put something special in this poly/cotton applique stocking - He will have to work overtime to fill this! Measures 19" diagonally from upper left facing to the tip of the toe. This is the first Christmas Beatles Stocking ever offered!',
        'price': '$20.00',
        'catalog number': '7287'}
s.train(url1, data)
url2 = 'http://www.thefest.com/store/beatles-ornaments/yellow-submarines-light-set-p-3876'
print(s.scrape(url2))