Example #1
    def scrape_data(self):
        # Create boilerplate recognizer
        skindex = Scraper(self.crawl_config)

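        # Load previously crawled pages and mark recurring (template) elements as boilerplate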
        skindex.load_local_pages()
        skindex.add_template_elements()

        # Process all pages: strip the detected boilerplate and extract the remaining data
        return skindex.process_all(exclude_data=['cleaned', 'author'])
Example #2
    def post(self):
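        # os, shutil, crawl, Scraper and the helpers used below are assumed to be
        # imported at module level; start from the default crawl config.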
        CRAWL_CONFIG = DEFAULT_CRAWL_CONFIG.copy()
        CRAWL_CONFIG.update({
            'collections_path': os.path.join(os.path.expanduser('~'), 'sky_view_collections/'),
            # 'max_workers': 10,
        })
        args = self.request.arguments
        print(args)
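        # Override config entries with any non-empty form fields (except 'url' and 'checkboxcache')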
        for arg in args:
            value = args[arg][0].decode('utf8')
            if value and arg != 'url' and arg != 'checkboxcache':
                print('pre', arg, CRAWL_CONFIG[arg])
                if isinstance(CRAWL_CONFIG[arg], list):
                    CRAWL_CONFIG[arg] = [int(value)] if is_numeric(value) else value.split(', ')
                else:
                    CRAWL_CONFIG[arg] = int(value) if is_numeric(value) else value.split(', ')[0]
                print('post', arg, CRAWL_CONFIG[arg])

        url = self.get_argument('url', '')

        use_cache = self.get_argument('checkboxcache', '')

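        # Seed the crawl with the submitted URL and name the collection after its domain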
        domain = extractDomain(url)
        CRAWL_CONFIG['seed_urls'] = [url]
        if domain.startswith("http"):
            CRAWL_CONFIG['collection_name'] = domain.split("/")[2]
        else:
            CRAWL_CONFIG['collection_name'] = domain.split("/")[0]

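        # When caching is off, delete any existing collection for this domain and re-crawl it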
        if use_cache != 'on':

            col_path = os.path.join(CRAWL_CONFIG['collections_path'],
                                    CRAWL_CONFIG['collection_name'])
            print(col_path)
            if os.path.exists(col_path):
                shutil.rmtree(col_path)

            crawl.start(CRAWL_CONFIG)

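        # Scraping reuses the crawl config, extended with template-detection parameters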
        SCRAPE_CONFIG = CRAWL_CONFIG.copy()

        SCRAPE_CONFIG.update({
            'template_proportion': 0.4,
            'max_templates': 100,
        })

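        # Load the crawled pages and mark recurring (template) elements as boilerplate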
        skindex = Scraper(SCRAPE_CONFIG)

        skindex.load_local_pages()
        skindex.add_template_elements()

        res = skindex.process_all(remove_visuals=True,
                                  maxn=CRAWL_CONFIG['max_saved_responses'])

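        # Flatten the per-URL results into a list of items for the page template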
        items = []
        for num, url in enumerate(res):
            if num == CRAWL_CONFIG['max_saved_responses']:
                break
            dc = res[url]
            dc['url'] = url
            dc['source_name'] = domain
            dc['images'] = list(reversed(dc['images'][:5]))
            # dc['blobs'] = [TextBlob(x) for x in dc['body'] if dc['body']]
            items.append(dc)

        # NOTE: diagnostic print; this image bookkeeping belongs elsewhere
        print('num unique images', len(get_image_set({x['url']: x for x in items})))

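        # If price-like data was extracted, show the items with the most 'money' matches first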
        if items and 'money' in items[0]:
            items = sorted(items, key=lambda x: len(x['money']), reverse=True)

        self.render('page_template.html', items=items, cached=False)
Example #3
# Imports assumed from the sky package layout; adjust to your installation
from sky.configs import DEFAULT_CRAWL_CONFIG
from sky.crawler import crawl
from sky.scraper import Scraper

# Crawling
CRAWL_CONFIG = DEFAULT_CRAWL_CONFIG.copy()  # copy so the shared defaults are not mutated
CRAWL_CONFIG.update({
    'seed_urls': ['http://www.techcrunch.com/'],
    'collections_path': '/Users/pascal/egoroot/sky_collections',
    'collection_name': 'techie',

    # Optional
    'crawl_filter_regexps': [],
    'crawl_required_regexps': ['2015', '2014'],
    'index_filter_regexps': [],
    'index_required_regexps': ['2015', '2014'],
    'max_saved_responses': 100,
    'max_workers': 10,
})

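# Crawl the seed URLs and save the responses into the collection (capped by max_saved_responses)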
crawl.start(CRAWL_CONFIG)

# Indexing

SCRAPE_CONFIG = CRAWL_CONFIG.copy()

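# Template detection parameters: proportion threshold and maximum number of templates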
SCRAPE_CONFIG.update({'template_proportion': 0.09, 'max_templates': 1000})

skindex = Scraper(SCRAPE_CONFIG)

skindex.load_local_pages()
skindex.add_template_elements()

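# Strip the detected boilerplate (and visual elements) and extract content from every saved page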
res = skindex.process_all(remove_visuals=True)
Example #4
    def __init__(self, config, cache=None):
        super(NewsCrawler, self).__init__(config, cache)
        self.scraper = Scraper(config)
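        # Per-crawler state: extracted data and progress of template detection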
        self.template_complete = False
        self.data = {}
        self.templates_done = 0