Example no. 1
def auto_generate(sampleurl, data, common_url):
    # Train a scraper on one annotated sample page, then scrape a similar page
    # and flatten each extracted field to a single cleaned string.
    s = Scraper()
    s.train(sampleurl, data)
    res = s.scrape(common_url)[0]
    for k, v in res.items():
        res[k] = v[0].replace('\n', '').strip()
    return res
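A minimal usage sketch for auto_generate; the URLs and the training dictionary are placeholders (borrowed from the country-page example further down), not part of this example.
sample_url = 'http://example.python-scraping.com/view/Afghanistan-1'
training = {'name': 'Afghanistan', 'population': '29,121,286'}
# Train on the annotated sample page, then scrape a structurally similar page.
record = auto_generate(sample_url, training,
                       'http://example.python-scraping.com/view/United-Kingdom-239')
print(record)  # e.g. {'name': 'United Kingdom', 'population': '...'}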
Example no. 2
    def _fallback(self, template, html, source):
        if not self.scrapely_parser:
            self.scrapely_parser = Scraper()

        # HtmlPage comes from scrapely.htmlpage; the Scraper instance has no
        # HtmlPage attribute.
        html = HtmlPage(body=html)
        # `uri` and `objct` are not defined in this snippet; they presumably
        # come from the surrounding class in the original project.
        db_objct = self.db.read(uri, objct)
        if db_objct:  # the original tested `if not db_objct:`, which would fail on the next line
            data = db_objct.attrs_to_dict()

            self.scrapely_parser.train_from_htmlpage(html, data)
            attr_dicts = self.scrapely_parser.scrape_page(html)

            for attr_dict in attr_dicts:
                objct = template._replicate(name=template.name, url=source.url)
                # Add the parsed values.
                objct.attrs_from_dict(attr_dict)
                yield objct
        return []
Example no. 3
def main():
    # sys, CLI_ARGS, univ_encode, HtmlPage and Scraper are imported/defined
    # elsewhere in the original script (see the spider example further down).
    if len(sys.argv) < len(CLI_ARGS)+1:
        print "Usage:", sys.argv[0], " ".join(CLI_ARGS)
        exit()
    try:
        with open(sys.argv[1], 'r') as f:
            data_to_match = sys.argv[2]
            body = f.read()
            scraper = Scraper()
            from scrapely.template import FragmentNotFound
            try:
                decoded_body = univ_encode(body)
                scraper.train_from_htmlpage(HtmlPage(body=decoded_body), {'score': data_to_match})
                print 0
            except FragmentNotFound:
                print -1
                return
    except IOError:
        print -2
        return
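univ_encode() is a project-specific helper that this fragment (and the spider example further down) relies on but does not define. A minimal stand-in, assuming its only job is to hand HtmlPage a unicode body:
def univ_encode(body):
    # Hypothetical replacement for the missing helper: decode bytes to text
    # so that HtmlPage(body=...) receives a unicode string.
    if isinstance(body, bytes):
        return body.decode('utf-8', errors='replace')
    return body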
Example no. 4
def get_scraper(
        url: str,
        scrapers_file_name: str = 'scrapers.json') -> Union[None, Scraper]:
    domain = get_domain(url)
    with open(get_file_path(scrapers_file_name), 'r') as scrapers_file:
        scrapers_json = json.load(scrapers_file)
        if domain in scrapers_json:
            scraper_file_name = scrapers_json[domain]
        else:
            return None

    with open(get_file_path(scraper_file_name), 'r') as scraper_file:
        return Scraper.fromfile(scraper_file)
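A hedged usage sketch for get_scraper; the URL is a placeholder and assumes a matching entry already exists in scrapers.json.
url = 'https://example.com/product/123'  # placeholder
scraper = get_scraper(url)
if scraper is not None:
    print(scraper.scrape(url))  # scrape() fetches the page and applies the trained template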
Example no. 5
    def test_extraction(self):

        samples_encoding = 'latin1'
        [(html1, data1), (html2, data2)] = list(iter_samples(
            'scraper_loadstore', html_encoding=samples_encoding))
        sc = Scraper()
        page1 = HtmlPage(body=html1, encoding=samples_encoding)
        sc.train_from_htmlpage(page1, data1)

        page2 = HtmlPage(body=html2, encoding=samples_encoding)
        extracted_data = sc.scrape_page(page2)
        self._assert_extracted(extracted_data, data2)

        # check still works after serialize/deserialize 
        f = StringIO()
        sc.tofile(f)
        f.seek(0)
        sc = Scraper.fromfile(f)
        extracted_data = sc.scrape_page(page2)
        self._assert_extracted(extracted_data, data2)
Example no. 6
    def test_train_store_load_scrape(self):
        url1 = "http://www.icone.co.uk/lighting-suspension/copper-shade-by-tom-dixon/tom-dixon/tom-dixon/MSS45UKC/"
        data = {"name": "Copper Shade by Tom Dixon", "designer": "Tom Dixon", "price": "320"}
        s = Scraper()
        s.train(url1, data, encoding="latin1")

        f = StringIO()
        s.tofile(f)

        f.seek(0)
        s = Scraper.fromfile(f)

        url2 = "http://www.icone.co.uk/lighting-wall-and-ceiling/mesmeri-halo-chrome/artemide/eric-sole/0916024A/"
        data = s.scrape(url2, encoding="latin1")
        self.assertEqual(sorted(data[0].keys()), ["designer", "name", "price"])
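The test above round-trips the scraper through a StringIO buffer; the same serialize/deserialize cycle works with a real file on disk, as Example no. 17 further down does (the filename here is a placeholder).
with open('icone_scraper.json', 'w') as f:  # placeholder filename
    s.tofile(f)
with open('icone_scraper.json', 'r') as f:
    s = Scraper.fromfile(f)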
Example no. 7
 def __init__(self, threshold=0.75, k=5):
     self.threshold = threshold
     self.k = k
     self.scraper = Scraper()
Example no. 8
class Depta(object):
    def __init__(self, threshold=0.75, k=5):
        self.threshold = threshold
        self.k = k
        self.scraper = Scraper()

    def extract(self, html='', **kwargs):
        """
        extract data field from raw html or from a url.
        """
        if not html and 'url' in kwargs:
            info = urlopen(kwargs.pop('url'))
            _, html = html_to_unicode(info.headers.get('content_type'), info.read())

        builder = DomTreeBuilder(html)
        root = builder.build()

        region_finder = MiningDataRegion(root, self.k, self.threshold)
        regions = region_finder.find_regions(root)

        record_finder = MiningDataRecord(self.threshold)
        field_finder = MiningDataField()

        for region in regions:
            records = record_finder.find_records(region)
            items, _ = field_finder.align_records(records)
            region.items = items
            if 'verbose' in kwargs:
                print region
                for record in records:
                    print '\t', record

        return regions

    def train(self, seed, data):
        """
        train scrapely from give seed region and data.
        """
        assert data, "Cannot train with empty data"
        htmlpage = self._region_to_htmlpage(seed)
        tm = TemplateMaker(htmlpage)
        if isinstance(data, dict):
            data = data.items()

        for field, values in data:
            if not hasattr(values, '__iter__'):
                values = [values]
            for value in values:
                if isinstance(value, str):
                    value = value.decode(htmlpage.encoding or 'utf-8')
                tm.annotate(field, best_match(value), best_match=False)
        self.scraper.add_template(tm.get_template())


    def infer(self, html='', **kwargs):
        """
        extract data with seed region and the data you expect to scrape from there.
        """
        if 'url' in kwargs:
            info = urlopen(kwargs.pop('url'))
            _, html = html_to_unicode(info.headers.get('content_type'), info.read())

        builder = DomTreeBuilder(html)
        doc = builder.build()
        page = HtmlPage(body=tostring(doc, encoding=unicode, method='html'))

        return self.scraper.scrape_page(page)

    def _region_to_htmlpage(self, region):
        seed_body = tostring(region.parent[region.start], encoding=unicode, method='html')
        return HtmlPage(body=seed_body)
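A hedged usage sketch for the Depta class above; the URL and the field values are placeholders.
depta = Depta(threshold=0.75, k=5)
regions = depta.extract(url='http://example.com/listing')        # mine repeating data regions
depta.train(regions[0], {'title': 'Example product', 'price': '$10.00'})
print(depta.infer(url='http://example.com/another-listing'))     # scrape a similar page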
Example no. 9
#!/usr/bin/python
# coding=utf-8
import html2text
import csv
from scrapely import Scraper
from urls import urls

h = html2text.HTML2Text()
s = Scraper()

# train
url1 = 'http://www.coca-colaitalia.it/storie/il-primo-ingrediente-dei-nostri-prodotti-e-lacqua'
data = {
    'title':
    'Il primo ingrediente dei nostri prodotti è l’acqua. Ecco come lo preserviamo',
    'text': '<div id="article">',
    'author': 'Redazione Journey',
    'date': '22 mar 2017'
}
s.train(url1, data)

# file opener
file_wht = open('test.csv', "wb")
writer = csv.writer(file_wht,
                    delimiter=';',
                    quotechar='"',
                    quoting=csv.QUOTE_ALL)
writer.writerow(("Titolo", "Testo", "Autore", "Data"))

# get stuff
for item in urls:
    # (loop body truncated in the original listing; presumably it scraped each
    # url with s.scrape(item) and wrote a csv row)
    pass
Example no. 10
from scrapely import Scraper
s = Scraper()
url1 = 'http://movie.douban.com/subject/1292063/'
data1 = {'name': u'美丽人生 La vita è bella', 'author': u'罗伯托·贝尼尼', 'time': '1997-12-20'}
s.train(url1, data1)

url2 = 'http://movie.douban.com/subject/1291560/'
# s.scrape(url2)
data2 = {'name': u'龙猫 となりのトトロ', 'author': u'宫崎骏', 'time': '1988-04-16'}
s.train(url2, data2)

url3 = 'http://movie.douban.com/subject/1293839/'
data3 = {'name': u'罗马假日 Roman Holiday', 'author': u'威廉·惠勒', 'time': '1953-08-27'}
# s.scrape(url3)
s.train(url3, data3)

url4 = 'http://movie.douban.com/subject/1292224/'
s.scrape(url4)


from scrapely import Scraper
s = Scraper()
url1 = 'http://movie.douban.com/subject/1292063/'
data1 = {'name': u'美丽人生 La vita è bella', 'author': u'罗伯托·贝尼尼', 'time': '1997-12-20'}
s.train(url1, data1)

url4 = 'http://movie.douban.com/subject/1292224/'
s.scrape(url4)
# with open('11.txt','wb') as afile:
# 	s.tofile(afile)
Example no. 11
    def create_data(self):
        training_url = "http://www.wholesalegaming.biz/startrek/trekalphastarterbox/"
        data_training = {"product": "Star Trek Alpha Unlimited Starter Box", "price": "$15.00"}

        #train scrapely
        scraper = Scraper()

        scraper.train(training_url, data_training)

        #get the URLs to check

        page_json = file("pages_to_check.json").read()

        #format (all strings in unicode) : {"urls" : [ <url1 string>, <url2 string>, ... , <urln string> ] }
        urls_to_check = json.loads(page_json)

        #get data

        #dictionary with "product name": "price"
        price_list = {}

        for each_url in urls_to_check["urls"]:
            scraped_data = scraper.scrape(each_url)
            #example of a scraped data: [{u'price': [u'&nbsp;$15.00&nbsp;'], u'product': [u'Star Trek Alpha Unlimited Starter Box']}]

            #let's sanitize the price to a float and make this a dictionary entry
            dollar_string = scraped_data[0]["price"][0].replace("&nbsp;","")
            removed_dollar_sign = dollar_string.replace("$", "")
            try:
                price_as_float = float(removed_dollar_sign)
            except ValueError:
                #If the value gotten isn't convertable to a float, then it
                #most likely is "Product Unavailable" and we need to deal
                #with this case later down.  N/A will be our tell for that.
                price_as_float = "N/A"
            #get the product name by itself.
            product_name = scraped_data[0]["product"][0]

            #now add the sanitized price and product name to price list
            price_list[product_name] = [price_as_float, each_url]

        #Create a json file of the prices
        timestamp = strftime("%Y-%m-%d-%H:%M:%S", gmtime())
        with open("/tmp/prices-%s.json" % timestamp, "w") as fp:
            json.dump(price_list, fp, sort_keys=True, indent=4)

        #Compare this price list to the most "recent" price list
        recent_price_list = {}

        with open('/tmp/prices-recent.json', 'r') as fp:
            recent_price_list = json.load(fp)

        #This will be the output data of comparing the old data and new data
        #format: {
        #            "product_one_name":
        #                {
        #                     "old_price": <float>
        #                     "new_price": <float>,
        #                     "new_difference": <float of new price - old price>,
        #                     "is_difference": <boolean>,
        #                     "is_new_product": <boolean>,
        #                     "is_discontinued_product": <boolean>
        #                },
        #            "product_two_name":...
        #
        comparison_data = {}

        for old_product, old_price in recent_price_list.iteritems():
            new_difference = 0.0
            is_difference = False
            is_new_product = False
            is_discontinued_product = False
            try:
                new_price = price_list[old_product]
                new_difference = new_price[0] - old_price[0]
            except(KeyError, TypeError):
                #take care of the case that old_product doesn't appear on price_list
                #This also takes care of the case the the old_price isn't a float because
                #the old price is marked as N/A
                new_price = [0.0]
                is_discontinued_product = True

            if new_difference != 0.0:
                is_difference = True

            comparison_data[old_product] = {
                                            "old_price": old_price[0],
                                            "new_price": new_price[0],
                                            "new_difference": new_difference,
                                            "is_difference": is_difference,
                                            "is_new_product": False,
                                            "is_discontinued_product": is_discontinued_product,
                                            "product_url": old_price[1]
                                        }

        #find all items on price_list that is not in recent_price_list
        new_inventory_set = set(price_list.keys()) - set(recent_price_list.keys())
        new_inventory_list = list(new_inventory_set)

        for each_product in new_inventory_list:
            comparison_data[each_product] = { "old_price": 0.0,
                                              "new_price": price_list[each_product][0],
                                              "new_difference": price_list[each_product][0],
                                              "is_difference": True,
                                              "is_new_product": True,
                                              "is_discontinued_product": False,
                                              "product_url": price_list[each_product][1]
                                        }

        #makes it easy to find the always most recent data
        with open("/tmp/price-comparison-recent.json", "w") as fp:
            json.dump(comparison_data, fp, sort_keys=True, indent=4)

        #update the recent prices
        with open("/tmp/prices-recent.json", "w") as fp:
            json.dump(price_list, fp, sort_keys=True, indent=4)

        #Create a file to be the most recent comparison data
        timestamp = strftime("%Y-%m-%d-%H:%M:%S", gmtime())
        if "True" in comparison_data:
            filename = "/tmp/price-comparison-%s.json"
            with open(filename, "w") as fp:
                json.dump(comparison_data, fp, sort_keys=True, indent=4)
                return filename

        return None
Example no. 12
class TrendingSpider(Spider):
    """docstring for TrendingSpider"""
    name = "trending_monitor"
    start_urls = []
    PRINT_STATS_EVERY_X_CRAWLED_PAGES = 100
    links_rule = None
    urls_seen = set()
    aborted = False
    crawled_all_pages = 0
    score_field_text_negative_matches = []
    def make_requests_from_url(self, url):
        return Request(url, dont_filter=True, meta={'start_url': url, 'metadepth': 0})

    # rules = (
        # Rule(SgmlLinkExtractor(allow=r'.+', deny=(r'.*(spampoison.*|cgi\/.*|accounts\.google\.com|login.*|microsoft\.com|\.(js|css|png|jpe?g|gif|bmp|tiff)(\?.*)?)')), follow=False, callback='parse_item'),
    # )

    def __init__(self, db_path, pid):
        print "\n===============================" * 2
        print "Starting TrendingSpider... FOR PID=", pid
        print "\n===============================" * 2
        self.project_id = int(pid)
        self.db_path = db_path
        self.fetch_project_data()
        if self.aborted:
            return
        print "Loaded", len(self.start_urls), "starting urls"
        self.start_time = time()
        self.crawled_pages = 0
        # This has to be set after we run fetch_project_data()
        self.links_rule = Rule(
            SgmlLinkExtractor(
                allow='.+',
                deny=(r'.*(spampoison.*|cgi\/.*|accounts\.google\.com|login.*|\.(js|css|png|jpe?g|gif|bmp|tiff)(\?.*)?)')
            ),
            follow=False,
            callback='parse_item'
        )
        self.links_rule_targeted = Rule(
            SgmlLinkExtractor(
                allow=self.allow_regexp,
                deny=(r'.*(spampoison.*|cgi\/.*|accounts\.google\.com|login.*|\.(js|css|png|jpe?g|gif|bmp|tiff)(\?.*)?)')
            ),
            follow=False,
            callback='parse_item'
        )
        super(TrendingSpider, self).__init__()
        dispatcher.connect(self.spider_closed, signals.spider_closed)

    def spider_closed(self):
        print "Closing spider, crawled", self.crawled_pages
        if self.db is not None:
            self.db.commit()

    def done(self):
        return self.urls_limit != 0 and (self.crawled_pages > self.urls_limit or self.crawled_all_pages > (self.urls_limit*10))

    def parse(self, response):
        self.crawled_all_pages += 1
        # This condition is there because even if we stopped adding new requests, we might still have more requests 
        # done in total than the self.url_limits
        # why? Because we stop when we reached sefl.urls_limit in terms of _crawled_ urls and not in terms of URLs 
        # added to the queue. This allows us to ensure we always crawl _at least_ self.urls_limit URLs but in return
        # we will most likely always crawl more than self.urls_limit because we will likely add new URLs before some
        # URLs in the queue (the queue having already reached the limit) have been fetched
        if self.done():
            return
        if (self.crawled_pages % self.PRINT_STATS_EVERY_X_CRAWLED_PAGES) == 0:
            delta = time()-self.start_time
            print "Current crawl speed: ", self.crawled_pages, "urls crawled,", delta, "seconds,", self.crawled_pages / delta, "pages/second"
        if self.links_rule_targeted.link_extractor.matches(response.url):
            print "page targeted", response.url
            self.crawled_pages += 1
            html_p = htmlpage_from_response(response)
            scraped_result = self.scraper.scrape_page(html_p)
            score = scraped_result[0]['score'][0]
            if self.score_field_text_negative_matches:
                for to_strip_off in self.score_field_text_negative_matches:
                    score = score.replace(to_strip_off, '')
            print "\n===============================" * 2
            print "score=", score
            print "\n===============================" * 2
            item = (
                response.url,
                score,
                int(time())
            )
            self.save_to_db(item)
        if self.done(): # wasting a little bit resources here because of ">" instead of ">="
            return  # We do not scrap the links, this time
        unique_new_links = set(
            [
                l for l in self.links_rule.link_extractor.extract_links(response) 
                if len(l.url) <= 255 and TrendingSpider.extract_domain(l.url) == self.our_domain
            ]) - self.urls_seen

        print "Got", len(unique_new_links), "new links"
        self.urls_seen |= unique_new_links
        return [Request(link.url) for link in unique_new_links]

    def save_to_db(self, item):
        self.db.execute('INSERT INTO result(TIMESTAMP, SCORE, PAGE, SEARCH_ID) VALUES(?, ?, ?, ?)',
            (
                item[2],
                item[1],
                item[0],
                self.project_id
            )
        )
        self.db.commit()

    def init_db(self):
        import sqlite3
        self.db = sqlite3.connect(self.db_path)

    def abort(self):
        sys.stderr.write("\n===============================" * 2)
        sys.stderr.write("\nSomething went wrong, aborting.")
        sys.stderr.write("\n===============================" * 2)
        self.start_urls = []
        self.aborted = True

    def fetch_project_data(self):
        self.init_db()
        # Fetch data from DB
        test=str(self.project_id)
        c = self.db.execute('SELECT * FROM search WHERE id=?', (test,))
        d = c.fetchone()
        if d is None:
            perr("No project found in DB")
            return self.abort()
        data_to_match = {'score': d[1]}
        body = d[2]
        url = d[3]
        self.our_domain = TrendingSpider.extract_domain(url)
        self.start_urls = [url]  # This is one of the improvements we could implement
        from scrapely.template import FragmentNotFound
        try:
            self.setup_scraper(body, url, data_to_match)
        except FragmentNotFound:
            perr("Unable to learn from data")
            # We were not able to learn, cancel the crawl by having no start urls
            return self.abort()
        self.allow_regexp = d[5]
        self.urls_limit = int(d[6])
        if d[7] != '' and d[7] is not None:
            self.score_field_text_negative_matches = d[7].split(d[8])
        print "urls_limit=", self.urls_limit

    def setup_scraper(self, body, url, data_to_scrape):
        self.scraper = Scraper()
        decoded_body = univ_encode(body)
        self.scraper.train_from_htmlpage(HtmlPage(url=url, body=decoded_body), data_to_scrape)
    
    @staticmethod
    def extract_domain(url):
        try:
            url = url[url.index("//")+2:] # getting rid of protocol://
        except ValueError:
            # There was no protocol specified
            pass
        try:
            url = url[:url.index("/")] # getting rid of everything after the first "/"
        except ValueError:
            # Maybe it was a domain-only url, with no "/"
            pass
        return url
Example no. 13
 def __init__(self, threshold=0.75, k=5):
     self.threshold = threshold
     self.k = k
     self.scraper = Scraper()
Example no. 14
from scrapely import Scraper

s = Scraper()

url = ""
data = {}
s.train(url, data)
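A filled-in version of the skeleton above, reusing the training values from Example no. 16; the URL and fields are illustrative only.
from scrapely import Scraper

s = Scraper()
url = 'http://example.python-scraping.com/view/Afghanistan-1'
data = {'name': 'Afghanistan', 'population': '29,121,286'}
s.train(url, data)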
Example no. 15
from scrapely import Scraper
import sys
import json
try:
	scrape_site=sys.argv[1]
except IndexError:
	print 'Invalid arguments. Usage: python scrape.py <site-name>'
	sys.exit(2)
print 'Training the scraper with existing data-set'
s=Scraper()
result={}
train_data=json.loads(open(scrape_site+'_train.json','r').read())
for data in train_data:
	s.train( data['url'],{'name':data['title']})
test_data=json.loads(open(scrape_site+'_tests.json','r').read())
for data in test_data:
	for item in s.scrape(data['url']):  # scrape() returns a list of dicts; merge each one
		result.update(item)
open(scrape_site+'_result.json','w').write(json.dumps(result))

Example no. 16
def scrapely_test():
    s = Scraper()
    train_url = 'http://example.python-scraping.com/view/Afghanistan-1'
    s.train(train_url, {'name': 'Afghanistan', 'population': '29,121,286'})
    test_url = 'http://example.python-scraping.com/view/United-Kingdom-239'
    print(s.scrape(test_url))
Example no. 17
def update_scrapers_file(url):
    domain = re.search(r'(?<=\/\/)[\w\.-]+(?=\/)', url).group()
    scraper_file_name = ""
    scrapers_json = {}
    with open('scrapers.json', 'r') as scrapers_file:
        scrapers_json = json.load(scrapers_file)

    scraper_file_name = domain + ".json"
    scrapers_json[domain] = scraper_file_name
    with open('scrapers.json', 'w') as scrapers_file:
        json.dump(scrapers_json, scrapers_file)

    return scraper_file_name


# TODO add help and verbose modes
# TODO add arg validation and error feedback
scraper = Scraper()
training_params = open_training_file()
assert training_params, "no training parameters found in {}".format(
    sys.argv[1])
url = training_params['url']
params = training_params['params']
scraper.train(url, params)
# TODO replace this with database action and maybe do checksum compare to avoid writing same scraper more than once?
scraper_file_name = update_scrapers_file(url)

with open(scraper_file_name, 'w') as scraper_file:
    scraper.tofile(scraper_file)
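open_training_file() is not shown in this example; a plausible sketch, assuming it simply loads a JSON file named on the command line, with the 'url' and 'params' keys used above:
import json
import sys

def open_training_file():
    # Hypothetical helper: load {"url": ..., "params": {...}} from the path given in sys.argv[1].
    with open(sys.argv[1]) as f:
        return json.load(f)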
Example no. 18
class HTMLParser(BaseParser):
    '''
    A parser that is able to parse html.
    '''
    def __init__(self, **kwargs):
        super(HTMLParser, self).__init__(**kwargs)
        self.scrapely_parser = None
        for key, value in kwargs.items():
            setattr(self, key, value)

    def _prepare_data(self, source):
        json_key = source.json_key
        data = source.data.decode('utf8')
        if json_key:  # if the data is json, return it straightaway
            json_raw = json.loads(data)
            if hasattr(json_key, '__iter__') and json_key[0] in json_raw:
                data = reduce(dict.get, json_key, json_raw)
            elif type(json_key) == str and json_key in json_raw:
                data = json_raw[json_key]
            else:
                return False
        try:  # Create an HTML object from the returned text.
            data = lxhtml.fromstring(data)
        except ValueError:  # This happens when xml is declared in html.
            data = lxhtml.fromstring('\n'.join(data.split('\n')[1:]))
        except TypeError:
            print(data)
            print('Something weird has been returned by the server.')
        data.make_links_absolute(self.domain)
        return data

    def _get_selector(self, model):
        # assert len(model.selector) == 1, "Only one selector can be used."
        if model.selector:
            if type(model.selector) in (CSSSelector, XPath):
                return model.selector
            else:
                try:
                    return CSSSelector(model.selector[0])
                except SelectorSyntaxError:
                    return XPath(model.selector[0])
                except:
                    raise Exception('Not a valid css or xpath selector',
                                    model.selector)
        return None

    def _apply_selector(self, selector, data):
        if selector:
            return selector(data)
        else:
            return (data, )

    def _extract(self, html, template):
        # We have normal html
        if not template.js_regex:
            if html is not None:
                extracted = self._apply_selector(template.selector, html)
            else:
                extracted = []
        # We want to extract a json_variable from the server
        else:
            regex = re.compile(template.js_regex)
            extracted = []
            # Find all the scripts that match the regex.
            scripts = (regex.findall(s.text_content())[0]
                       for s in html.cssselect('script')
                       if regex.search(s.text_content()))

            # Set selected to the scripts
            for script in scripts:
                extracted.extend(json.loads(script))
        return extracted

    def _source_from_object(self, objct, source):
        # TODO fix that the source object can determine for itself where data
        # or params should be placed in the object.
        new_source = objct.source._replicate()
        attrs = {
            attr.name: attr.value
            for attr in objct.attrs.values() if attr.name != 'url'
        }

        if not getattr(new_source, 'url', None):
            url = objct.attrs.get('url')

            if url and not isinstance(url, list):
                new_source.url = self.parent._apply_src_template(
                    source, url.value)
            else:
                new_source.url = self.parent._apply_src_template(
                    source, source.url)

        if new_source.copy_attrs:
            new_source = self._copy_attrs(objct, new_source)

        if new_source.parent:
            new_source.attrs['_parent'] = objct.attrs['url']._replicate()

        if new_source.method == 'post':
            new_source.data = {**new_source.data, **attrs}  # noqa
        else:
            new_source.params = attrs

        self.parent._add_source(new_source)

    def _fallback(self, template, html, source):
        if not self.scrapely_parser:
            self.scrapely_parser = Scraper()

        # HtmlPage comes from scrapely.htmlpage; the Scraper instance has no
        # HtmlPage attribute.
        html = HtmlPage(body=html)
        # `uri` and `objct` are not defined in this snippet; they presumably
        # come from the surrounding class in the original project.
        db_objct = self.db.read(uri, objct)
        if db_objct:  # the original tested `if not db_objct:`, which would fail on the next line
            data = db_objct.attrs_to_dict()

            self.scrapely_parser.train_from_htmlpage(html, data)
            attr_dicts = self.scrapely_parser.scrape_page(html)

            for attr_dict in attr_dicts:
                objct = template._replicate(name=template.name, url=source.url)
                # Add the parsed values.
                objct.attrs_from_dict(attr_dict)
                yield objct
        return []

    def _convert_to_element(self, parsed):
        elements = []
        for p in parsed:
            if not type(p) == lxhtml.HtmlElement:
                elem = lxhtml.Element('p')
                elem.text = p
                elements.append(elem)
        return elements

    @add_other_doc(BaseParser.modify_text)
    def sel_text(self, elements, all_text=True, **kwargs):  # noqa
        '''
        Select all text for a given selector.
        '''
        if all_text:
            text = [el.text_content() for el in elements]
        else:
            text = [el.text for el in elements]
        return self._sel_text(text, **kwargs)

    def sel_table(self, elements, columns: int = 2, offset: int = 0):
        '''
        Parses a nxn table into a dictionary.
        Works best when the input is a td selector.
        Specify the amount of columns with the columns parameter.
        example:
            parse a 2x2 table
            {'func': sel_table,
            'params': {
                'selector': CSSSelector('table td'),
                'columns': 2,
                'offset': 0,
                }
            }
            leads to:
            sel_table(html=lxml.etree, selector=CSSSelector('table td'),
                    columns=2, offset=0)
        '''
        keys = [el.text for el in elements[offset::columns]]
        values = [el.text for el in elements[1::columns]]
        return dict(zip(keys, values))

    def sel_row(self,
                elements,
                row_selector: int = None,
                value: str = '',
                attr=None,
                index=None):
        # text_content() is the lxml API (the original called text_contents()),
        # and the comprehension order is fixed so `row` is bound before use.
        rows = [row for row in elements if value in row.text_content()]
        if attr:
            selected = [
                sel for row in rows for sel in sel_attr(row, row_selector)
            ]
        else:
            selected = [
                sel for row in rows for sel in sel_text(row, row_selector)
            ]
        return self._value(selected, index)

    def sel_attr(self, elements, attr: str = '', **kwargs):
        '''
        Extract an attribute of an HTML element. Will return
        a list of attributes if multiple tags match the
        selector.

        The **kwargs are the keyword arguments that can be added are from
        the BaseParser.modify_text method.
        '''

        attrs = (el.attrib.get(attr) for el in elements)
        return self._sel_text(attrs, **kwargs)

    def sel_url(self, elements, index: int = None, **kwargs):
        return self.sel_attr(elements, attr='href', index=index, **kwargs)

    def sel_date(self,
                 elements,
                 fmt: str = 'YYYYmmdd',
                 attr: str = None,
                 index: int = None):
        '''
        Returns a python date object with the specified format.
        '''
        # The original referenced undefined `html` and `selector`; use the
        # class's own helpers on the selected elements instead.
        if attr:
            date = self.sel_attr(elements, attr=attr, index=index)
        else:
            date = self.sel_text(elements, index=index)
        if date:
            return datetime.strptime(date, fmt)

    def sel_exists(self, elements, key: str = '', index: int = None):
        '''
        Return True if a keyword is in the selector text,
        '''
        text = self.sel_text(elements)
        if text:
            if key in text:
                return True
            return False

    def sel_raw_html(self, elements):
        return [el.raw_html for el in elements]

    def sel_json(self, obj, selector, key=''):
        return obj.get(key)

    def sel_js_array(self, elements, var_name='', var_type=None):
        var_regex = r'var\s*' + var_name + r'\s*=\s*(?:new Array\(|\[)(.*)(?:\)|\]);'
        array_string = self.sel_text(elements, regex=var_regex)
        if array_string:
            if var_type:
                return list(map(var_type, array_string.split(',')))
            return array_string.split(',')

    def fill_form(self, elements, fields={}, attrs=[]):
        for form in elements:
            data = {**dict(form.form_values()), **fields}
            source = Source(url=form.action,
                            method=form.method,
                            duplicate=True,
                            attrs=attrs)
            if source.method == 'GET':
                source.params = data
            else:
                source.data = data
            self._add_source(source)
Example no. 19
BUS = Namespace("http://purl.org/wikibus/omnibusrevue/")
OR = Namespace("http://purl.org/wikibus/omnibusrevue/bus/")
FOAF = Namespace("http://xmlns.com/foaf/0.1/")

def CreateGraph(busId, busData):
    graph = Graph()
    busRes = OR[busId]
    graph.add((busRes, RDF.type, BUS["Bus"]))
    graph.add((busRes, FOAF["page"], Literal(busUrlFormatWithName.format(busData[0]['model'][0].encode('utf-8'), busId))))
    for key in busData[0]:        
        obj = busData[0][key][0].encode('utf-8')
        if obj <> "k.A":
            graph.add((busRes, BUS[key], Literal(obj)))
    return graph.serialize(format='turtle')

busScraper = Scraper()
busScraper.train(busUrlFormat % '1120301', exampleData)

offset = 0
while True:
    html = scraperwiki.scrape(catalogUrlFormat % offset)
    root = lxml.html.fromstring(html)
    busIds = root.cssselect('input[type=checkbox]')
    if len(busIds) > 0:
        for busCheckbox in busIds:
            busUrl = busUrlFormat % busCheckbox.attrib['value']
            busGraph = CreateGraph(busCheckbox.attrib['value'], busScraper.scrape(busUrl))
            dataStored = {'url': busUrl, 'graph': busGraph}
            scraperwiki.sqlite.save(unique_keys=['url'], data=dataStored)
        offset += 20
    else:
        # (truncated in the original listing; presumably `break` once no more buses are listed)
        break
Example no. 20
# Google Search Package: https://breakingcode.wordpress.com/2010/06/29/google-search-python/
# Scrapely Package: https://github.com/scrapy/scrapely
# https://www.analyticsvidhya.com/blog/2015/10/beginner-guide-web-scraping-beautiful-soup-python/
# https://stackoverflow.com/questions/3898574/google-search-using-python-script

#imports
import urllib2
from bs4 import BeautifulSoup
from googlesearch.googlesearch import GoogleSearch
import csv
from scrapely import Scraper
from bs4 import UnicodeDammit
from collections import Counter
import re
import time
import sys
s = Scraper()


query = raw_input("Search Query: ")
try:
    n = int(raw_input("# of Websites to Scrape: "))
except ValueError:
    print "Enter Valid # of Websites"
    sys.exit()
'''
UNIXtime = int(time.time())
filename = query.replace(" ","_").lower()+"_"+str(n)+"_"+str(UNIXtime)
print filename
'''
# initialize dictionary to store search results
# rows: Name, Author, Description, Url
Example no. 21
"""
    @author: 挖掘机小王子
    @contact: [email protected]
    @software: PyCharm
    @file: test.py
    @time: 2019/12/6 11:53
    @desc:
"""
from scrapely import Scraper
import requests


scraper = Scraper()

url = 'https://www.ituring.com.cn/article'
data = {'name': 'duxingzhe', 'title': '这两天的面试经验总结'}
# response = requests.get(url).text
scraper.train(url, data)
result = scraper.scrape(url, encoding='utf-8')
print(result)
Example no. 22
# -*- coding: utf-8 -*-

import sys
reload(sys)
sys.setdefaultencoding('utf-8')

import pickle
from scrapely import Scraper
s = Scraper()

url1='ab4bc711263c261eb8127bcb.html'
data={
    'title': 'pthread对多线程访问全局数据结构的支持',
    'date': '2010-09-20  22:03',
    'category': '类别:Linux'
        }

if len(sys.argv) > 1:
    url2=sys.argv[1]
else:
    url2='fa2ebd45db2fd724cefca317.html'

#import pprint
##pp = pprint.Prettyprint(indent=2)
#pprint.pprint(d)
#print d[0]['title'][0]
#print d[0]['category'][0]
#print d[0]['date'][0]

s.train(url1, data)
Example no. 23
class Depta(object):
    def __init__(self, threshold=0.75, k=5):
        self.threshold = threshold
        self.k = k
        self.scraper = Scraper()

    def extract(self, html='', **kwargs):
        """
        extract data field from raw html or from a url.
        """
        if not html and 'url' in kwargs:
            info = urlopen(kwargs.pop('url'))
            _, html = html_to_unicode(info.headers.get('content_type'),
                                      info.read())

        builder = DomTreeBuilder(html)
        root = builder.build()

        region_finder = MiningDataRegion(root, self.k, self.threshold)
        regions = region_finder.find_regions(root)

        record_finder = MiningDataRecord(self.threshold)
        field_finder = MiningDataField()

        for region in regions:
            records = record_finder.find_records(region)
            items, _ = field_finder.align_records(records)
            region.items = items
            if 'verbose' in kwargs:
                print region
                for record in records:
                    print '\t', record

        return regions

    def train(self, seed, data):
        """
        train scrapely from give seed region and data.
        """
        assert data, "Cannot train with empty data"
        htmlpage = self._region_to_htmlpage(seed)
        dtm = DeptaTemplateMaker(htmlpage)
        if isinstance(data, dict):
            data = data.items()

        for field, values in data:
            if not hasattr(values, '__iter__'):
                values = [values]
            for value in values:
                if isinstance(value, str):
                    value = value.decode(htmlpage.encoding or 'utf-8')
                dtm.annotate(field, best_match(value))
        self.scraper.add_template(dtm.get_template())

    def infer(self, html='', **kwargs):
        """
        extract data with seed region and the data you expect to scrape from there.
        """
        if 'url' in kwargs:
            info = urlopen(kwargs.pop('url'))
            _, html = html_to_unicode(info.headers.get('content_type'),
                                      info.read())

        builder = DomTreeBuilder(html)
        doc = builder.build()
        page = HtmlPage(body=tostring(doc, encoding=unicode, method='html'))

        return self._scrape_page(page)

    def _scrape_page(self, page):
        if self.scraper._ex is None:
            self.scraper._ex = DeptaIBLExtractor(
                (t, None) for t in self.scraper._templates)
        return self.scraper._ex.extract(page)[0]

    def _region_to_htmlpage(self, region):
        seed_body = tostring(region.parent[region.start],
                             encoding=unicode,
                             method='html')
        return HtmlPage(body=seed_body)
Example no. 24
 def setup_scraper(self, body, url, data_to_scrape):
     self.scraper = Scraper()
     decoded_body = univ_encode(body)
     self.scraper.train_from_htmlpage(HtmlPage(url=url, body=decoded_body), data_to_scrape)
Example no. 25
from scrapely import Scraper
import sys
import json
try:
    scrape_site = sys.argv[1]
except IndexError:
    print 'Invalid arguments. Usage: python scrape.py <site-name>'
    sys.exit(2)
print 'Training the scraper with existing data-set'
s = Scraper()
result = {}
train_data = json.loads(open(scrape_site + '_train.json', 'r').read())
for data in train_data:
    s.train(data['url'], {'name': data['title']})
test_data = json.loads(open(scrape_site + '_tests.json', 'r').read())
for data in test_data:
    for item in s.scrape(data['url']):  # scrape() returns a list of dicts; merge each one
        result.update(item)
open(scrape_site + '_result.json', 'w').write(json.dumps(result))
Example no. 26
from scrapely import Scraper

s = Scraper()

url = ""
data = {}  # note: scrape() does not take training data; that belongs to train(url, data)
s.scrape(url)
Example no. 27
#!/usr/bin/env python
# -*- coding: utf-8 -*-
# @Time    : 2020/3/3 0:27
# @Author  : tanxw

# pip install scrapely
from scrapely import Scraper
s = Scraper()
train_url = 'http://example.webscraping.com/places/default/view/Afghanistan-1'
s.train(train_url, {'name': 'Afghanistan', 'population': '29,121,286'})
test_url = 'http://example.webscraping.com/places/default/view/United-Kingdom-239'
s.scrape(test_url)
Example no. 28
import urllib, scraperwiki
from scrapely import Scraper

s = Scraper()  # train on one product page below, then scrape a similar one
url1 = 'http://www.thefest.com/store/beatles-ornaments/the-beatles-applique-stocking-p-3901'
data = {'name': 'THE BEATLES APPLIQUE STOCKING', 'category': 'Beatles Ornaments', 'description': 'BRAND NEW- If you are good, maybe Santa will put something special in this poly/cotton applique stocking - He will have to work overtime to fill this! Measures 19" diagonally from upper left facing to the tip of the toe. This is the first Christmas Beatles Stocking ever offered!', 'price': '$20.00', 'catalog number': '7287'}
s.train(url1,data)
url2 = 'http://www.thefest.com/store/beatles-ornaments/yellow-submarines-light-set-p-3876'
print s.scrape(url2)