def auto_generate(sampleurl, data, common_url):
    """Train a scrapely Scraper on (sampleurl, data), then scrape common_url.

    Returns the first scraped result as a dict whose values are flattened
    from scrapely's list form to a single newline-free, trimmed string.
    """
    scraper = Scraper()
    scraper.train(sampleurl, data)
    # scrapely returns a list of result dicts; only the first is used here
    first_result = scraper.scrape(common_url)[0]
    return {
        key: values[0].replace('\n', '').strip()
        for key, values in first_result.items()
    }
def _fallback(self, template, html, source):
    """Scrapely-based fallback extraction for pages the template missed.

    Lazily creates a shared scrapely parser, trains it from a previously
    stored object's attributes, scrapes the given page and yields one
    replicated template object per extracted attribute dict.

    NOTE(review): the original body had three defects that are repaired
    here, with the reconstruction hedged below:
      * ``self.db.read(uri, objct)`` referenced undefined names ``uri``
        and ``objct`` (NameError); ``source.url`` / ``template`` are the
        values in scope — TODO confirm against the db API.
      * ``if not db_objct: data = db_objct.attrs_to_dict()`` called a
        method on a falsy object; training is only possible when a
        stored object exists, so the condition is inverted into a guard.
      * ``return []`` at the end of a generator is dead weight; a bare
        return ends iteration cleanly.
    """
    if not self.scrapely_parser:
        self.scrapely_parser = Scraper()
    html = self.scrapely_parser.HtmlPage(body=html)
    # presumably keyed by the page URL — verify against self.db.read's signature
    db_objct = self.db.read(source.url, template)
    if not db_objct:
        # Nothing stored to train from: no fallback extraction possible.
        return
    data = db_objct.attrs_to_dict()
    self.scrapely_parser.train_from_htmlpage(html, data)
    attr_dicts = self.scrapely_parser.scrape_page(html)
    for attr_dict in attr_dicts:
        objct = template._replicate(name=template.name, url=source.url)
        # Add the parsed values.
        objct.attrs_from_dict(attr_dict)
        yield objct
def test_extraction(self):
    """Train on one sample page, extract from a second, and verify the
    scraper still works after a tofile/fromfile round trip."""
    enc = 'latin1'
    samples = list(iter_samples('scraper_loadstore', html_encoding=enc))
    [(html1, data1), (html2, data2)] = samples

    scraper = Scraper()
    scraper.train_from_htmlpage(HtmlPage(body=html1, encoding=enc), data1)

    target_page = HtmlPage(body=html2, encoding=enc)
    self._assert_extracted(scraper.scrape_page(target_page), data2)

    # check still works after serialize/deserialize
    buf = StringIO()
    scraper.tofile(buf)
    buf.seek(0)
    restored = Scraper.fromfile(buf)
    self._assert_extracted(restored.scrape_page(target_page), data2)
# Google Search Package: https://breakingcode.wordpress.com/2010/06/29/google-search-python/ # Scrapely Package: https://github.com/scrapy/scrapely # https://www.analyticsvidhya.com/blog/2015/10/beginner-guide-web-scraping-beautiful-soup-python/ # https://stackoverflow.com/questions/3898574/google-search-using-python-script #imports import urllib2 from bs4 import BeautifulSoup from googlesearch.googlesearch import GoogleSearch import csv from scrapely import Scraper from bs4 import UnicodeDammit from collections import Counter import re import time s = Scraper() query = raw_input("Search Query: ") try: n = int(raw_input("# of Websites to Scrape: ")) except ValueError: print "Enter Valid # of Websites" sys.exit() ''' UNIXtime = int(time.time()) filename = query.replace(" ","_").lower()+"_"+str(n)+"_"+str(UNIXtime) print filename ''' # initialize dictionary to store search results # rows: Name, Author, Description, Url
def __init__(self, threshold=0.75, k=5):
    """Store the matching threshold and neighbour count, and create the
    scrapely Scraper instance used for extraction."""
    self.scraper = Scraper()
    self.k = k
    self.threshold = threshold
""" @author: 挖掘机小王子 @contact: [email protected] @software: PyCharm @file: test.py @time: 2019/12/6 11:53 @desc: """ from scrapely import Scraper import requests scraper = Scraper() url = 'https://www.ituring.com.cn/article' data = {'name': 'duxingzhe', 'title': '这两天的面试经验总结'} # response = requests.get(url).text scraper.train(url, data) result = scraper.scrape(url, encoding='utf-8') print(result)
BUS = Namespace("http://purl.org/wikibus/omnibusrevue/") OR = Namespace("http://purl.org/wikibus/omnibusrevue/bus/") FOAF = Namespace("http://xmlns.com/foaf/0.1/") def CreateGraph(busId, busData): graph = Graph() busRes = OR[busId] graph.add((busRes, RDF.type, BUS["Bus"])) graph.add((busRes, FOAF["page"], Literal(busUrlFormatWithName.format(busData[0]['model'][0].encode('utf-8'), busId)))) for key in busData[0]: obj = busData[0][key][0].encode('utf-8') if obj <> "k.A": graph.add((busRes, BUS[key], Literal(obj))) return graph.serialize(format='turtle') busScraper = Scraper() busScraper.train(busUrlFormat % '1120301', exampleData) offset = 0 while True: html = scraperwiki.scrape(catalogUrlFormat % offset) root = lxml.html.fromstring(html) busIds = root.cssselect('input[type=checkbox]') if len(busIds) > 0: for busCheckbox in busIds: busUrl = busUrlFormat % busCheckbox.attrib['value'] busGraph = CreateGraph(busCheckbox.attrib['value'], busScraper.scrape(busUrl)) dataStored = {'url': busUrl, 'graph': busGraph} scraperwiki.sqlite.save(unique_keys=['url'], data=dataStored) offset += 20 else:
def scrapely_test():
    """Train scrapely on one country page and print what it extracts
    from a different country page of the same site."""
    scraper = Scraper()
    training_url = 'http://example.python-scraping.com/view/Afghanistan-1'
    scraper.train(training_url, {'name': 'Afghanistan', 'population': '29,121,286'})
    target_url = 'http://example.python-scraping.com/view/United-Kingdom-239'
    print(scraper.scrape(target_url))
import urllib, scraperwiki from scrapely import Scraper s = Scraper() # note how we're *not* using Scraper() - this uses our custom version url1 = 'http://www.thefest.com/store/beatles-ornaments/the-beatles-applique-stocking-p-3901' data = {'name': 'THE BEATLES APPLIQUE STOCKING', 'category': 'Beatles Ornaments', 'description': 'BRAND NEW- If you are good, maybe Santa will put something special in this poly/cotton applique stocking - He will have to work overtime to fill this! Measures 19" diagonally from upper left facing to the tip of the toe. This is the first Christmas Beatles Stocking ever offered!', 'price': '$20.00', 'catalog number': '7287'} s.train(url1,data) url2 = 'http://www.thefest.com/store/beatles-ornaments/yellow-submarines-light-set-p-3876' print s.scrape(url2)import urllib, scraperwiki from scrapely import Scraper s = Scraper() # note how we're *not* using Scraper() - this uses our custom version url1 = 'http://www.thefest.com/store/beatles-ornaments/the-beatles-applique-stocking-p-3901' data = {'name': 'THE BEATLES APPLIQUE STOCKING', 'category': 'Beatles Ornaments', 'description': 'BRAND NEW- If you are good, maybe Santa will put something special in this poly/cotton applique stocking - He will have to work overtime to fill this! Measures 19" diagonally from upper left facing to the tip of the toe. This is the first Christmas Beatles Stocking ever offered!', 'price': '$20.00', 'catalog number': '7287'} s.train(url1,data) url2 = 'http://www.thefest.com/store/beatles-ornaments/yellow-submarines-light-set-p-3876' print s.scrape(url2)