Example #1
def auto_generate(sampleurl,data,common_url):
    s = Scraper()
    s.train(sampleurl, data)
    res = (s.scrape(common_url))[0]
    for k,v in res.items():
        res[k] = v[0].replace('\n', '').strip()
    return res
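A brief usage sketch for the auto_generate helper above; the URLs and field values are hypothetical placeholders, not taken from the original snippet:

# Train on one annotated page, then extract the same fields from a similar page.
sample_url = 'http://example.com/products/1'                 # hypothetical training page
annotations = {'name': 'Example product', 'price': '9.99'}   # values visible on that page
print(auto_generate(sample_url, annotations, 'http://example.com/products/2'))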
Example #2
    def test_train_store_load_scrape(self):
        url1 = "http://www.icone.co.uk/lighting-suspension/copper-shade-by-tom-dixon/tom-dixon/tom-dixon/MSS45UKC/"
        data = {"name": "Copper Shade by Tom Dixon", "designer": "Tom Dixon", "price": "320"}
        s = Scraper()
        s.train(url1, data, encoding="latin1")

        f = StringIO()
        s.tofile(f)

        f.seek(0)
        s = Scraper.fromfile(f)

        url2 = "http://www.icone.co.uk/lighting-wall-and-ceiling/mesmeri-halo-chrome/artemide/eric-sole/0916024A/"
        data = s.scrape(url2, encoding="latin1")
        self.assertEqual(sorted(data[0].keys()), ["designer", "name", "price"])
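The same tofile/fromfile round trip also works with a file on disk; a minimal sketch, assuming a writable path 'scraper.json' and a trained Scraper s as in the test above:

# Persist the trained templates and restore them in a later run.
with open('scraper.json', 'w') as f:
    s.tofile(f)
with open('scraper.json') as f:
    s = Scraper.fromfile(f)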
Example #3
from scrapely import Scraper
import sys
import json
try:
    scrape_site = sys.argv[1]
except IndexError:
    print('Invalid arguments. Usage: python scrape.py <site-name>')
    sys.exit(2)
print('Training the scraper with the existing data set')
s = Scraper()
result = {}
train_data = json.loads(open(scrape_site + '_train.json', 'r').read())
for data in train_data:
    s.train(data['url'], {'name': data['title']})
test_data = json.loads(open(scrape_site + '_tests.json', 'r').read())
for data in test_data:
    # scrape() returns a list of record dicts; store the results keyed by URL
    result[data['url']] = s.scrape(data['url'])
open(scrape_site + '_result.json', 'w').write(json.dumps(result))
Example #4
"""
    @author: 挖掘机小王子
    @contact: [email protected]
    @software: PyCharm
    @file: test.py
    @time: 2019/12/6 11:53
    @desc:
"""
from scrapely import Scraper
import requests


scraper = Scraper()

url = 'https://www.ituring.com.cn/article'
data = {'name': 'duxingzhe', 'title': '这两天的面试经验总结'}
# response = requests.get(url).text
scraper.train(url, data)
result = scraper.scrape(url, encoding='utf-8')
print(result)
Example #5
import lxml.html
import scraperwiki
from rdflib import Graph, Literal, Namespace
from rdflib.namespace import RDF
from scrapely import Scraper

# BUS, busUrlFormat, busUrlFormatWithName, catalogUrlFormat and exampleData
# are defined elsewhere in the original ScraperWiki script.
OR = Namespace("http://purl.org/wikibus/omnibusrevue/bus/")
FOAF = Namespace("http://xmlns.com/foaf/0.1/")

def CreateGraph(busId, busData):
    graph = Graph()
    busRes = OR[busId]
    graph.add((busRes, RDF.type, BUS["Bus"]))
    graph.add((busRes, FOAF["page"], Literal(busUrlFormatWithName.format(busData[0]['model'][0].encode('utf-8'), busId))))
    for key in busData[0]:        
        obj = busData[0][key][0].encode('utf-8')
        if obj != "k.A":
            graph.add((busRes, BUS[key], Literal(obj)))
    return graph.serialize(format='turtle')

busScraper = Scraper()
busScraper.train(busUrlFormat % '1120301', exampleData)

offset = 0
while True:
    html = scraperwiki.scrape(catalogUrlFormat % offset)
    root = lxml.html.fromstring(html)
    busIds = root.cssselect('input[type=checkbox]')
    if len(busIds) > 0:
        for busCheckbox in busIds:
            busUrl = busUrlFormat % busCheckbox.attrib['value']
            busGraph = CreateGraph(busCheckbox.attrib['value'], busScraper.scrape(busUrl))
            dataStored = {'url': busUrl, 'graph': busGraph}
            scraperwiki.sqlite.save(unique_keys=['url'], data=dataStored)
        offset += 20
    else:
        break
Example #6
from scrapely import Scraper
s = Scraper()
url1 = 'http://movie.douban.com/subject/1292063/'
data1 = {'name': u'美丽人生 La vita è bella', 'author': u'罗伯托·贝尼尼', 'time': '1997-12-20'}
s.train(url1, data1)

url2 = 'http://movie.douban.com/subject/1291560/'
# s.scrape(url2)
data2 = {'name': u'龙猫 となりのトトロ', 'author': u'宫崎骏', 'time': '1988-04-16'}
s.train(url2, data2)

url3 = 'http://movie.douban.com/subject/1293839/'
data3 = {'name': u'罗马假日 Roman Holiday', 'author': u'威廉·惠勒', 'time': '1953-08-27'}
# s.scrape(url3)
s.train(url3, data3)

url4 = 'http://movie.douban.com/subject/1292224/'
print(s.scrape(url4))


from scrapely import Scraper
s = Scraper()
url1 = 'http://movie.douban.com/subject/1292063/'
data1 = {'name': u'美丽人生 La vita è bella', 'author': u'罗伯托·贝尼尼', 'time': '1997-12-20'}
s.train(url1, data1)

url4 = 'http://movie.douban.com/subject/1292224/'
print(s.scrape(url4))
# with open('11.txt','wb') as afile:
# 	s.tofile(afile)
Example #7
    def create_data(self):
        training_url = "http://www.wholesalegaming.biz/startrek/trekalphastarterbox/"
        data_training = {"product": "Star Trek Alpha Unlimited Starter Box", "price": "$15.00"}

        #train scrapely
        scraper = Scraper()

        scraper.train(training_url, data_training)

        #get the URLs to check

        page_json = open("pages_to_check.json").read()

        #format (all strings in unicode) : {"urls" : [ <url1 string>, <url2 string>, ... , <urln string> ] }
        urls_to_check = json.loads(page_json)

        #get data

        #dictionary with "product name": "price"
        price_list = {}

        for each_url in urls_to_check["urls"]:
            scraped_data = scraper.scrape(each_url)
            #example of a scraped data: [{u'price': [u'&nbsp;$15.00&nbsp;'], u'product': [u'Star Trek Alpha Unlimited Starter Box']}]

            #let's sanitize the price to a float and make this a dictionary entry
            dollar_string = scraped_data[0]["price"][0].replace("&nbsp;","")
            removed_dollar_sign = dollar_string.replace("$", "")
            try:
                price_as_float = float(removed_dollar_sign)
            except ValueError:
                #If the value isn't convertible to a float, it is most likely
                #"Product Unavailable"; mark it as N/A and handle that case
                #further down.
                price_as_float = "N/A"
            #get the product name by itself.
            product_name = scraped_data[0]["product"][0]

            #now add the sanitized price and product name to price list
            price_list[product_name] = [price_as_float, each_url]

        #Create a json file of the prices
        timestamp = strftime("%Y-%m-%d-%H:%M:%S", gmtime())
        with open("/tmp/prices-%s.json" % timestamp, "w") as fp:
            json.dump(price_list, fp, sort_keys=True, indent=4)

        #Compare this price list to the most "recent" price list
        recent_price_list = {}

        with open('/tmp/prices-recent.json', 'r') as fp:
            recent_price_list = json.load(fp)

        #This will be the output data of comparing the old data and new data
        #format: {
        #            "product_one_name":
        #                {
        #                     "old_price": <float>
        #                     "new_price": <float>,
        #                     "new_difference": <float of new price - old price>,
        #                     "is_difference": <boolean>,
        #                     "is_new_product": <boolean>,
        #                     "is_discontinued_product": <boolean>
        #                },
        #            "product_two_name":...
        #
        comparison_data = {}

        for old_product, old_price in recent_price_list.items():
            new_difference = 0.0
            is_difference = False
            is_new_product = False
            is_discontinued_product = False
            try:
                new_price = price_list[old_product]
                new_difference = new_price[0] - old_price[0]
            except(KeyError, TypeError):
                #Handle the case where old_product doesn't appear in price_list.
                #This also covers the case where old_price isn't a float because
                #the old price is marked as N/A.
                new_price = [0.0]
                is_discontinued_product = True

            if new_difference != 0.0:
                is_difference = True

            comparison_data[old_product] = {
                                            "old_price": old_price[0],
                                            "new_price": new_price[0],
                                            "new_difference": new_difference,
                                            "is_difference": is_difference,
                                            "is_new_product": False,
                                            "is_discontinued_product": is_discontinued_product,
                                            "product_url": old_price[1]
                                        }

        #find all items on price_list that is not in recent_price_list
        new_inventory_set = set(price_list.keys()) - set(recent_price_list.keys())
        new_inventory_list = list(new_inventory_set)

        for each_product in new_inventory_list:
            comparison_data[each_product] = { "old_price": 0.0,
                                              "new_price": price_list[each_product][0],
                                              "new_difference": price_list[each_product][0],
                                              "is_difference": True,
                                              "is_new_product": True,
                                              "is_discontinued_product": False,
                                              "product_url": price_list[each_product][1]
                                        }

        #makes it easy to find the always most recent data
        with open("/tmp/price-comparison-recent.json", "w") as fp:
            json.dump(comparison_data, fp, sort_keys=True, indent=4)

        #update the recent prices
        with open("/tmp/prices-recent.json", "w") as fp:
            json.dump(price_list, fp, sort_keys=True, indent=4)

        #Archive a timestamped copy of the comparison data when any price changed
        timestamp = strftime("%Y-%m-%d-%H:%M:%S", gmtime())
        if any(entry["is_difference"] for entry in comparison_data.values()):
            filename = "/tmp/price-comparison-%s.json" % timestamp
            with open(filename, "w") as fp:
                json.dump(comparison_data, fp, sort_keys=True, indent=4)
            return filename

        return None
Example #8
#!/usr/bin/env python
# -*- coding: utf-8 -*-
# @Time    : 2020/3/3 0:27
# @Author  : tanxw

# pip install scrapely
from scrapely import Scraper
s = Scraper()
train_url = 'http://example.webscraping.com/places/default/view/Afghanistan-1'
s.train(train_url, {'name': 'Afghanistan', 'population': '29,121,286'})
test_url = 'http://example.webscraping.com/places/default/view/United-Kingdom-239'
records = s.scrape(test_url)
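scrape() returns a list of record dicts whose values are lists of extracted strings (the raw form is shown in a comment in Example #7); a small sketch of unwrapping the first record, assuming the call above found a match:

# Field values come back as lists of strings; keep the first match for each field.
if records:
    country = {field: values[0].strip() for field, values in records[0].items()}
    print(country)  # e.g. {'name': 'United Kingdom', 'population': '...'}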
Example #9
from scrapely import Scraper
import sys
import json
try:
    scrape_site = sys.argv[1]
except IndexError:
    print('Invalid arguments. Usage: python scrape.py <site-name>')
    sys.exit(2)
print('Training the scraper with the existing data-set')
s = Scraper()
result = {}
train_data = json.loads(open(scrape_site + '_train.json', 'r').read())
for data in train_data:
    s.train(data['url'], {'name': data['title']})
test_data = json.loads(open(scrape_site + '_tests.json', 'r').read())
for data in test_data:
    # store the extracted records for each test URL
    result[data['url']] = s.scrape(data['url'])
open(scrape_site + '_result.json', 'w').write(json.dumps(result))

Example #10
from scrapely import Scraper

s = Scraper()

url = ""
data = {}
s.train(url, data)
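Example #10 above is only a skeleton; a minimal sketch of how it might be filled in, with a hypothetical URL and a field value that would have to match text visible on the training page:

# Hypothetical training call (URL and values are placeholders, not from the original snippet).
url = 'http://example.com/item/1'
data = {'title': 'Item 1 title exactly as it appears on the page'}
s.train(url, data)
print(s.scrape('http://example.com/item/2'))  # extract the same field from a similar page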
Example #11
def scrapely_test():
    s = Scraper()
    train_url = 'http://example.python-scraping.com/view/Afghanistan-1'
    s.train(train_url, {'name': 'Afghanistan', 'population': '29,121,286'})
    test_url = 'http://example.python-scraping.com/view/United-Kingdom-239'
    print(s.scrape(test_url))
Example #12
# json, re and sys are used below; open_training_file() is defined elsewhere in the original script.
import json
import re
import sys

from scrapely import Scraper


def update_scrapers_file(url):
    domain = re.search(r'(?<=\/\/)[\w\.-]+(?=\/)', url).group()
    scraper_file_name = ""
    scrapers_json = {}
    with open('scrapers.json', 'r') as scrapers_file:
        scrapers_json = json.load(scrapers_file)

    scraper_file_name = domain + ".json"
    scrapers_json[domain] = scraper_file_name
    with open('scrapers.json', 'w') as scrapers_file:
        json.dump(scrapers_json, scrapers_file)

    return scraper_file_name


# TODO add help and verbose modes
# TODO add arg validation and error feedback
scraper = Scraper()
training_params = open_training_file()
assert training_params, "no training parameters found in {}".format(
    sys.argv[1])
url = training_params['url']
params = training_params['params']
scraper.train(url, params)
# TODO replace this with database action and maybe do checksum compare to avoid writing same scraper more than once?
scraper_file_name = update_scrapers_file(url)

with open(scraper_file_name, 'w') as scraper_file:
    scraper.tofile(scraper_file)
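A sketch of loading the scraper file written above back into memory later (Scraper.fromfile is the counterpart shown in Example #2); the target URL is a hypothetical placeholder:

# Reload the persisted scraper and reuse it on another page (URL is a placeholder).
with open(scraper_file_name, 'r') as scraper_file:
    restored = Scraper.fromfile(scraper_file)
print(restored.scrape('http://example.com/another-page'))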
Example #13
import csv
import html2text
from scrapely import Scraper
from urls import urls

h = html2text.HTML2Text()
s = Scraper()

# train
url1 = 'http://www.coca-colaitalia.it/storie/il-primo-ingrediente-dei-nostri-prodotti-e-lacqua'
data = {
    'title':
    'Il primo ingrediente dei nostri prodotti è l’acqua. Ecco come lo preserviamo',
    'text': '<div id="article">',
    'author': 'Redazione Journey',
    'date': '22 mar 2017'
}
s.train(url1, data)

# file opener
file_wht = open('test.csv', "wb")
writer = csv.writer(file_wht,
                    delimiter=';',
                    quotechar='"',
                    quoting=csv.QUOTE_ALL)
writer.writerow(("Titolo", "Testo", "Autore", "Data"))

# get stuff
for item in urls:
    try:
        content = s.scrape(item)[0]
        title = h.handle(content["title"][0]).encode('utf-8')
        parsed_text = h.handle(content["text"][0]).encode('utf-8')
Example #14
    'category': '类别:Linux'
        }

if len(sys.argv) > 1:
    url2=sys.argv[1]
else:
    url2='fa2ebd45db2fd724cefca317.html'

#import pprint
##pp = pprint.PrettyPrinter(indent=2)
#pprint.pprint(d)
#print d[0]['title'][0]
#print d[0]['category'][0]
#print d[0]['date'][0]

s.train(url1, data)

import os

data = {}

#for dirname, dirnames, filenames in os.walk('../utf8/'):
#    for filename in filenames:

for fn in os.listdir('../utf8/'):
    print fn
    url2 = '../utf8/' + fn
    d = s.scrape(url2)
    try:
        data[fn] = {
                'title': unicode(d[0]['title'][0]),