def auto_generate(sampleurl, data, common_url):
    """Train a scrapely Scraper on one annotated page, then scrape another.

    Args:
        sampleurl: URL of the example page used for training.
        data: mapping of field name -> example value visible on sampleurl.
        common_url: structurally similar URL to extract from.

    Returns:
        The first scrape result, with each field flattened from a list to a
        single string with newlines removed and surrounding whitespace trimmed.
    """
    scraper = Scraper()
    scraper.train(sampleurl, data)
    first_match = scraper.scrape(common_url)[0]
    return {field: values[0].replace('\n', '').strip()
            for field, values in first_match.items()}
def test_train_store_load_scrape(self):
    """Round-trip a trained template through an in-memory file, then scrape.

    Trains on one product page, serialises the scraper, restores it from the
    same buffer, and checks the restored scraper extracts the expected fields
    from a second page.
    """
    train_url = "http://www.icone.co.uk/lighting-suspension/copper-shade-by-tom-dixon/tom-dixon/tom-dixon/MSS45UKC/"
    sample = {"name": "Copper Shade by Tom Dixon", "designer": "Tom Dixon", "price": "320"}
    scraper = Scraper()
    scraper.train(train_url, sample, encoding="latin1")

    # Serialise the learned template and load it back from the buffer.
    buf = StringIO()
    scraper.tofile(buf)
    buf.seek(0)
    scraper = Scraper.fromfile(buf)

    target_url = "http://www.icone.co.uk/lighting-wall-and-ceiling/mesmeri-halo-chrome/artemide/eric-sole/0916024A/"
    results = scraper.scrape(target_url, encoding="latin1")
    self.assertEqual(sorted(results[0].keys()), ["designer", "name", "price"])
from scrapely import Scraper
import sys
import json

# Usage: python scrape.py <site-name>
# Reads <site>_train.json / <site>_tests.json and writes <site>_result.json.
try:
    scrape_site = sys.argv[1]
except IndexError:  # was a bare except: only a missing argv entry can occur here
    # Fixed typo in the usage message ("arguements" -> "arguments").
    print('Invalid arguments. Usage python scrape.py <site-name>')
    sys.exit(2)

print('Training the scraper with existing data-set')
s = Scraper()
result = {}

# Train on every (url, title) pair from the training set.
with open(scrape_site + '_train.json', 'r') as f:
    train_data = json.loads(f.read())
for data in train_data:
    s.train(data['url'], {'name': data['title']})

# Scraper.scrape returns a *list* of result dicts; the original passed the
# whole list to dict.update, which raises. Merge the best (first) match.
with open(scrape_site + '_tests.json', 'r') as f:
    test_data = json.loads(f.read())
for data in test_data:
    result.update(s.scrape(data['url'])[0])

with open(scrape_site + '_result.json', 'w') as f:
    f.write(json.dumps(result))
""" @author: 挖掘机小王子 @contact: [email protected] @software: PyCharm @file: test.py @time: 2019/12/6 11:53 @desc: """ from scrapely import Scraper import requests scraper = Scraper() url = 'https://www.ituring.com.cn/article' data = {'name': 'duxingzhe', 'title': '这两天的面试经验总结'} # response = requests.get(url).text scraper.train(url, data) result = scraper.scrape(url, encoding='utf-8') print(result)
if obj <> "k.A": graph.add((busRes, BUS[key], Literal(obj))) return graph.serialize(format='turtle') busScraper = Scraper() busScraper.train(busUrlFormat % '1120301', exampleData) offset = 0 while True: html = scraperwiki.scrape(catalogUrlFormat % offset) root = lxml.html.fromstring(html) busIds = root.cssselect('input[type=checkbox]') if len(busIds) > 0: for busCheckbox in busIds: busUrl = busUrlFormat % busCheckbox.attrib['value'] busGraph = CreateGraph(busCheckbox.attrib['value'], busScraper.scrape(busUrl)) dataStored = {'url': busUrl, 'graph': busGraph} scraperwiki.sqlite.save(unique_keys=['url'], data=dataStored) offset += 20 else: breakimport scraperwiki import lxml.html from scrapely import Scraper from rdflib import RDF from rdflib.graph import Graph from rdflib import Literal, BNode, Namespace catalogUrlFormat ='http://www.omnibusrevue.de/buskatalog-578829.html?skip=%d' busUrlFormat = 'http://www.omnibusrevue.de/bus-%s.html' busUrlFormatWithName = 'http://www.omnibusrevue.de/{0}-{1}.html'
from scrapely import Scraper

# Experiment 1: train on three Douban movie pages, then scrape a fourth.
s = Scraper()

url1 = 'http://movie.douban.com/subject/1292063/'
data1 = {'name': u'美丽人生 La vita è bella', 'author': u'罗伯托·贝尼尼', 'time': '1997-12-20'}
s.train(url1, data1)

url2 = 'http://movie.douban.com/subject/1291560/'
# s.scrape(url2)
data2 = {'name': u'龙猫 となりのトトロ', 'author': u'宫崎骏', 'time': '1988-04-16'}
s.train(url2, data2)

url3 = 'http://movie.douban.com/subject/1293839/'
data3 = {'name': u'罗马假日 Roman Holiday', 'author': u'威廉·惠勒', 'time': '1953-08-27'}
# s.scrape(url3)
s.train(url3, data3)

url4 = 'http://movie.douban.com/subject/1292224/'
s.scrape(url4)

# Experiment 2: a fresh scraper trained on a single example page, applied
# to the same target URL (kept from the original script).
from scrapely import Scraper

s = Scraper()
url1 = 'http://movie.douban.com/subject/1292063/'
data1 = {'name': u'美丽人生 La vita è bella', 'author': u'罗伯托·贝尼尼', 'time': '1997-12-20'}
s.train(url1, data1)
url4 = 'http://movie.douban.com/subject/1292224/'
s.scrape(url4)

# with open('11.txt','wb') as afile:
#     s.tofile(afile)
def create_data(self):
    """Scrape current prices, diff them against the previous run, persist both.

    Trains a scrapely scraper on one known product page, scrapes every URL in
    pages_to_check.json, writes a timestamped price list plus a comparison
    against /tmp/prices-recent.json, and updates the "recent" files.

    Returns:
        Path of the newly written comparison file when any price changed or a
        product appeared/disappeared, otherwise None.
    """
    training_url = "http://www.wholesalegaming.biz/startrek/trekalphastarterbox/"
    data_training = {"product": "Star Trek Alpha Unlimited Starter Box", "price": "$15.00"}

    # Train scrapely on the single annotated example page.
    scraper = Scraper()
    scraper.train(training_url, data_training)

    # Get the URLs to check.
    # format (all strings in unicode): {"urls": [<url1>, <url2>, ..., <urln>]}
    page_json = file("pages_to_check.json").read()
    urls_to_check = json.loads(page_json)

    # Dictionary mapping "product name" -> [price, url].
    price_list = {}
    for each_url in urls_to_check["urls"]:
        # Example scrape result:
        # [{u'price': [u' $15.00 '], u'product': [u'Star Trek Alpha Unlimited Starter Box']}]
        scraped_data = scraper.scrape(each_url)

        # Sanitize the price to a float.
        dollar_string = scraped_data[0]["price"][0].replace(" ", "")
        removed_dollar_sign = dollar_string.replace("$", "")
        try:
            price_as_float = float(removed_dollar_sign)
        except ValueError:
            # If the value isn't convertible to a float it most likely reads
            # "Product Unavailable"; N/A is our tell for that further down.
            price_as_float = "N/A"

        product_name = scraped_data[0]["product"][0]
        price_list[product_name] = [price_as_float, each_url]

    # Snapshot this run's prices to a timestamped file.
    timestamp = strftime("%Y-%m-%d-%H:%M:%S", gmtime())
    with open("/tmp/prices-%s.json" % timestamp, "w") as fp:
        json.dump(price_list, fp, sort_keys=True, indent=4)

    # Load the most recent price list to diff against.
    recent_price_list = {}
    with open('/tmp/prices-recent.json', 'r') as fp:
        recent_price_list = json.load(fp)

    # Output of comparing old and new data. Format:
    # { "product_name": {
    #       "old_price": <float>,
    #       "new_price": <float>,
    #       "new_difference": <float of new price - old price>,
    #       "is_difference": <boolean>,
    #       "is_new_product": <boolean>,
    #       "is_discontinued_product": <boolean>,
    #       "product_url": <string> }, ... }
    comparison_data = {}
    for old_product, old_price in recent_price_list.iteritems():
        new_difference = 0.0
        is_difference = False
        is_discontinued_product = False
        try:
            new_price = price_list[old_product]
            new_difference = new_price[0] - old_price[0]
        except (KeyError, TypeError):
            # KeyError: old_product no longer appears in price_list.
            # TypeError: the old price was "N/A" rather than a float.
            new_price = [0.0]
            is_discontinued_product = True
        if new_difference != 0.0:
            is_difference = True
        comparison_data[old_product] = {
            "old_price": old_price[0],
            "new_price": new_price[0],
            "new_difference": new_difference,
            "is_difference": is_difference,
            "is_new_product": False,
            "is_discontinued_product": is_discontinued_product,
            "product_url": old_price[1],
        }

    # Anything in price_list but not in recent_price_list is new inventory.
    new_inventory_list = list(set(price_list.keys()) - set(recent_price_list.keys()))
    for each_product in new_inventory_list:
        comparison_data[each_product] = {
            "old_price": 0.0,
            "new_price": price_list[each_product][0],
            "new_difference": price_list[each_product][0],
            "is_difference": True,
            "is_new_product": True,
            "is_discontinued_product": False,
            "product_url": price_list[each_product][1],
        }

    # Makes it easy to find the always most recent data.
    with open("/tmp/price-comparison-recent.json", "w") as fp:
        json.dump(comparison_data, fp, sort_keys=True, indent=4)

    # Update the recent prices.
    with open("/tmp/prices-recent.json", "w") as fp:
        json.dump(price_list, fp, sort_keys=True, indent=4)

    # Write a timestamped comparison file only when something actually changed.
    # BUGFIX 1: the original tested `"True" in comparison_data`, i.e. looked
    # for a product literally named "True" — check the is_difference flags.
    # BUGFIX 2: the original never applied `% timestamp`, so the file was
    # literally named "/tmp/price-comparison-%s.json".
    timestamp = strftime("%Y-%m-%d-%H:%M:%S", gmtime())
    if any(entry["is_difference"] for entry in comparison_data.itervalues()):
        filename = "/tmp/price-comparison-%s.json" % timestamp
        with open(filename, "w") as fp:
            json.dump(comparison_data, fp, sort_keys=True, indent=4)
        return filename
    return None
#!/usr/bin/env python
# -*- coding: utf-8 -*-
# @Time : 2020/3/3 0:27
# @Author : tanxw
# pip install scrapely

from scrapely import Scraper

# One annotated example page is enough for scrapely to induce a template.
s = Scraper()
train_url = 'http://example.webscraping.com/places/default/view/Afghanistan-1'
s.train(train_url, {'name': 'Afghanistan', 'population': '29,121,286'})

# Apply the learned template to a structurally identical country page.
test_url = 'http://example.webscraping.com/places/default/view/United-Kingdom-239'
s.scrape(test_url)
from scrapely import Scraper

# Skeleton: fill in an example URL and the field values visible on that page.
s = Scraper()
url = ""
data = {}
# BUGFIX: Scraper.scrape() does not accept example data — its second
# parameter is `encoding`. The annotated example must be given to train();
# scrape() then takes only the target URL.
s.train(url, data)
s.scrape(url)
from scrapely import Scraper
import sys
import json

# Usage: python scrape.py <site-name>
# Trains from <site>_train.json, scrapes <site>_tests.json, writes
# <site>_result.json.
try:
    scrape_site = sys.argv[1]
except IndexError:  # narrowed from a bare except: only argv can fail here
    # Fixed typo in the usage message ("arguements" -> "arguments").
    print('Invalid arguments. Usage python scrape.py <site-name>')
    sys.exit(2)

print('Training the scraper with existing data-set')
s = Scraper()
result = {}

with open(scrape_site + '_train.json', 'r') as handle:
    train_data = json.loads(handle.read())
for data in train_data:
    s.train(data['url'], {'name': data['title']})

with open(scrape_site + '_tests.json', 'r') as handle:
    test_data = json.loads(handle.read())
for data in test_data:
    # BUGFIX: scrape() returns a list of result dicts; updating the result
    # dict with that list raises. Merge the first (best) match instead.
    result.update(s.scrape(data['url'])[0])

with open(scrape_site + '_result.json', 'w') as handle:
    handle.write(json.dumps(result))
def scrapely_test():
    """Train on one country page and print the fields scraped from another."""
    scraper = Scraper()
    sample_url = 'http://example.python-scraping.com/view/Afghanistan-1'
    scraper.train(sample_url, {'name': 'Afghanistan', 'population': '29,121,286'})
    target_url = 'http://example.python-scraping.com/view/United-Kingdom-239'
    print(scraper.scrape(target_url))
# NOTE(review): fragment — the opening of the training-data dict (and the
# definitions of `s`, `url1`, `urls`, `h`) lies before this view; this is the
# dict's tail plus a CSV-export loop. Python 2 (`print` statements). The bare
# `except:` swallows every scrape/encode failure as ":(". Left byte-identical;
# reformat once the missing head is in view.
'text': '<div id="article">', 'author': 'Redazione Journey', 'date': '22 mar 2017' } s.train(url1, data) # file opener file_wht = open('test.csv', "wb") writer = csv.writer(file_wht, delimiter=';', quotechar='"', quoting=csv.QUOTE_ALL) writer.writerow(("Titolo", "Testo", "Autore", "Data")) # get stuff for item in urls: try: content = s.scrape(item)[0] title = h.handle(content["title"][0]).encode('utf-8') parsed_text = h.handle(content["text"][0]).encode('utf-8') author = h.handle(content["author"][0]).encode('utf-8') date = h.handle(content["date"][0]).encode('utf-8') print "Success!" tpl = (title, parsed_text, author, date) writer.writerow(tpl) except: print ":(" file_wht.close()
# Fragment continues an earlier script: `s`, `url1`, `data` and `pickle`
# are defined before this view. Debug lines kept commented for reference.
##pp = pprint.Prettyprint(indent=2)
#pprint.pprint(d)
#print d[0]['title'][0]
#print d[0]['category'][0]
#print d[0]['date'][0]
s.train(url1, data)

import os

# Scrape every file in ../utf8/ with the trained template and collect
# the extracted fields per filename.
data = {}
#for dirname, dirnames, filenames in os.walk('../utf8/'):
#    for filename in filenames:
for fn in os.listdir('../utf8/'):
    print(fn)
    url2 = '../utf8/' + fn
    d = s.scrape(url2)
    try:
        data[fn] = {
            'title': unicode(d[0]['title'][0]),
            'category': unicode(d[0]['category'][0]),
            'date': unicode(d[0]['date'][0]),
        }
    except Exception as e:
        print(e)

open('data.pickle', 'w').write(pickle.dumps(data))
import urllib, scraperwiki
from scrapely import Scraper

s = Scraper()
# note how we're *not* using Scraper() - this uses our custom version
url1 = 'http://www.thefest.com/store/beatles-ornaments/the-beatles-applique-stocking-p-3901'
data = {'name': 'THE BEATLES APPLIQUE STOCKING',
        'category': 'Beatles Ornaments',
        'description': 'BRAND NEW- If you are good, maybe Santa will put something special in this poly/cotton applique stocking - He will have to work overtime to fill this! Measures 19" diagonally from upper left facing to the tip of the toe. This is the first Christmas Beatles Stocking ever offered!',
        'price': '$20.00',
        'catalog number': '7287'}
s.train(url1, data)
url2 = 'http://www.thefest.com/store/beatles-ornaments/yellow-submarines-light-set-p-3876'
print(s.scrape(url2))

# The original pasted the identical snippet twice (the trailing print and the
# next import were fused); the second copy is preserved below.
import urllib, scraperwiki
from scrapely import Scraper

s = Scraper()
# note how we're *not* using Scraper() - this uses our custom version
url1 = 'http://www.thefest.com/store/beatles-ornaments/the-beatles-applique-stocking-p-3901'
data = {'name': 'THE BEATLES APPLIQUE STOCKING',
        'category': 'Beatles Ornaments',
        'description': 'BRAND NEW- If you are good, maybe Santa will put something special in this poly/cotton applique stocking - He will have to work overtime to fill this! Measures 19" diagonally from upper left facing to the tip of the toe. This is the first Christmas Beatles Stocking ever offered!',
        'price': '$20.00',
        'catalog number': '7287'}
s.train(url1, data)
url2 = 'http://www.thefest.com/store/beatles-ornaments/yellow-submarines-light-set-p-3876'
print(s.scrape(url2))