Example 1
    def scrape_data(self):
        # Create boilerplate recognizer
        skindex = Scraper(self.crawl_config)

        skindex.load_local_pages()
        skindex.add_template_elements()

        # Process all by removing boilerplate and extracting information
        return skindex.process_all(exclude_data=['cleaned', 'author'])
Example 2
class NewsCrawler(Crawler):
    def __init__(self, config, cache=None):
        super(NewsCrawler, self).__init__(config, cache)
        self.scraper = Scraper(config)
        self.template_complete = False
        self.data = {}
        self.templates_done = 0

    async def save_response(self, html_code, url, headers, crawl_date):
        # TODO: this catch-all error handling deserves a cleanup
        try:
            # let the indexer save the files as normal and also build up the template
            tree = makeTree(html_code, self.scraper.domain)
            if self.templates_done < self.scraper.config["max_templates"]:
                self.templates_done += 1
                self.scraper.domain_nodes_dict.add_template_elements(tree)
                self.scraper.url_to_headers_mapping[url] = headers
            self.data[url] = self.scraper.process(url, tree, False,
                                                  ["cleaned"])
            self.data[url]["crawl_date"] = crawl_date
            scrape_date = time.strftime("%Y-%m-%dT%H:%M:%S",
                                        time.localtime(time.time()))
            self.data[url]["scrape_date"] = scrape_date
        except Exception as e:
            LOGGER.error(
                "CRITICAL ERROR IN SCRAPER for url %r: %r, stack %r",
                url,
                str(e),
                traceback.format_exc(),
            )
        return

    def save_data(self, data):
        raise NotImplementedError("save_data has to be implemented")

    def save_bulk_data(self, data):
        raise NotImplementedError("save_bulk_data has to be implemented")

    def finish_leftovers(self):
        LOGGER.info("finish leftovers")
        if self.data:
            image_set = get_image_set(self.data)
            LOGGER.info("saving number of documents: %r", len(self.data))
            LOGGER.info("found num unique images: %r", len(image_set))
            LOGGER.info("saving status code: %r",
                        self.save_bulk_data(self.data))
        return dict(self.scraper.domain_nodes_dict)
Example 3
class NewsCrawler(Crawler):

    def __init__(self, config, cache=None):
        super(NewsCrawler, self).__init__(config, cache)
        self.scraper = Scraper(config)
        self.template_complete = False
        self.data = {}
        self.templates_done = 0

    @asyncio.coroutine
    def save_response(self, html_code, url, headers, crawl_date):
        # TODO: this catch-all error handling deserves a cleanup
        try:
            # let the indexer save the files as normal and also build up the template
            tree = makeTree(html_code, self.scraper.domain)
            if self.templates_done < self.scraper.config['max_templates']:
                self.templates_done += 1
                self.scraper.domain_nodes_dict.add_template_elements(tree)
                self.scraper.url_to_headers_mapping[url] = headers
            self.data[url] = self.scraper.process(url, tree, False, ['cleaned'])
            self.data[url]['crawl_date'] = crawl_date
            scrape_date = time.strftime('%Y-%m-%dT%H:%M:%S', time.localtime(time.time()))
            self.data[url]['scrape_date'] = scrape_date
        except Exception as e:
            LOGGER.error("CRITICAL ERROR IN SCRAPER for url %r: %r, stack %r",
                         url, str(e), traceback.format_exc())
        return

    def save_data(self, data):
        raise NotImplementedError('save_data has to be implemented')

    def save_bulk_data(self, data):
        raise NotImplementedError('save_bulk_data has to be implemented')

    def finish_leftovers(self):
        LOGGER.info('finish leftovers')
        if self.data:
            image_set = get_image_set(self.data)
            LOGGER.info('saving number of documents: %r', len(self.data))
            LOGGER.info('found num unique images: %r', len(image_set))
            LOGGER.info('saving status code: %r', self.save_bulk_data(self.data))
        return dict(self.scraper.domain_nodes_dict)
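In both variants above, save_data and save_bulk_data are deliberately left raising NotImplementedError: a concrete crawler is expected to plug in its own storage, and finish_leftovers hands it the whole {url: document} dict. A minimal sketch only, assuming JSON files on disk are an acceptable store and that the config carries the 'collections_path' key seen in the CRAWL_CONFIG examples further down (the JsonNewsCrawler name is made up for illustration):

import json
import os


class JsonNewsCrawler(NewsCrawler):
    """Hypothetical subclass that persists scraped documents as JSON files."""

    def save_data(self, doc):
        # one document: write it into the collection folder, named after a slug of its URL
        fname = doc['url'].replace('://', '_').replace('/', '_') + '.json'
        # assumes 'collections_path' is present in the config, as in the CRAWL_CONFIG examples
        path = os.path.join(self.scraper.config['collections_path'], fname)
        with open(path, 'w') as fh:
            json.dump(doc, fh, default=str)  # default=str keeps dates serializable
        return path

    def save_bulk_data(self, data):
        # `data` is the {url: document} dict built up in save_response
        for url, doc in data.items():
            self.save_data(dict(doc, url=url))
        return len(data)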
Example 4
    'index_filter_regexps': [],
    'index_required_regexps': ['2015', '2014'],
    'max_saved_responses': 100,
    'max_workers': 10,
})

crawl.start(CRAWL_CONFIG)

# Indexing

SCRAPE_CONFIG = CRAWL_CONFIG.copy()

SCRAPE_CONFIG.update({
    'template_proportion': 0.09,
    'max_templates': 1000
})

skindex = Scraper(SCRAPE_CONFIG)

skindex.load_local_pages()
skindex.add_template_elements()

res = skindex.process_all(remove_visuals=True)
Example 5
# Crawling
CRAWL_CONFIG = DEFAULT_CRAWL_CONFIG.copy()  # copy so the shared default config is not mutated
CRAWL_CONFIG.update({
    'seed_urls': ['http://www.techcrunch.com/'],
    'collections_path': '/Users/pascal/egoroot/sky_collections',
    'collection_name': 'techie',

    # Optional
    'crawl_filter_regexps': [],
    'crawl_required_regexps': ['2015', '2014'],
    'index_filter_regexps': [],
    'index_required_regexps': ['2015', '2014'],
    'max_saved_responses': 100,
    'max_workers': 10,
})

crawl.start(CRAWL_CONFIG)

# Indexing

SCRAPE_CONFIG = CRAWL_CONFIG.copy()

SCRAPE_CONFIG.update({'template_proportion': 0.09, 'max_templates': 1000})

skindex = Scraper(SCRAPE_CONFIG)

skindex.load_local_pages()
skindex.add_template_elements()

res = skindex.process_all(remove_visuals=True)
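
Both script examples end with process_all, which returns a plain dict keyed by URL; the view handlers below walk over it in exactly this way. A short, purely illustrative way to peek at the result (the 'images' field name is taken from those handlers):

# illustrative only: res maps each scraped URL to its extracted document
for num, url in enumerate(res):
    if num == 5:  # just look at the first few documents
        break
    doc = res[url]
    print(url, len(doc.get('images', [])), 'images')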
Example 6
File: view.py Project: bgarrels/sky
    def post(self):
        CRAWL_CONFIG = DEFAULT_CRAWL_CONFIG.copy()  # copy so repeated requests do not mutate the shared default
        CRAWL_CONFIG.update({
            'collections_path': os.path.join(os.path.expanduser('~'), 'sky_collections/'),
            # 'max_workers': 10,
        })
        args = self.request.arguments
        print(args)
        for arg in args:
            value = args[arg][0].decode('utf8')
            if value and arg != 'url' and arg != 'checkboxcache':
                print('pre', arg, CRAWL_CONFIG[arg])
                if isinstance(CRAWL_CONFIG[arg], list):
                    CRAWL_CONFIG[arg] = [int(value)] if is_numeric(value) else [value]
                else:
                    CRAWL_CONFIG[arg] = int(value) if is_numeric(value) else value
                print('post', arg, CRAWL_CONFIG[arg])

        url = self.get_argument('url', '')

        use_cache = self.get_argument('checkboxcache', '')

        domain = extractDomain(url)
        CRAWL_CONFIG['seed_urls'] = [url]
        CRAWL_CONFIG['collection_name'] = domain[7:]

        if use_cache != 'on':

            col_path = os.path.join(CRAWL_CONFIG['collections_path'],
                                    CRAWL_CONFIG['collection_name'])
            print(col_path)
            if os.path.exists(col_path):
                shutil.rmtree(col_path)

            crawl.start(CRAWL_CONFIG)

        SCRAPE_CONFIG = CRAWL_CONFIG.copy()

        SCRAPE_CONFIG.update({
            'template_proportion': 0.4,
            'max_templates': 100
        })

        skindex = Scraper(SCRAPE_CONFIG)

        skindex.load_local_pages()
        skindex.add_template_elements()

        res = skindex.process_all(remove_visuals=True,
                                  maxn=CRAWL_CONFIG['max_saved_responses'])

        items = []
        for num, url in enumerate(res):
            if num == CRAWL_CONFIG['max_saved_responses']:
                break
            dc = res[url]
            dc['url'] = url
            dc['source_name'] = domain
            dc['images'] = [x for x in reversed(dc['images'][:5])]
            # dc['blobs'] = [TextBlob(x) for x in dc['body'] if dc['body']]
            items.append(dc)

        # this is quite out of place like this
        print('num unique images', len(get_image_set({x['url']: x for x in items})))

        if items and 'money' in items[0]:
            items = sorted(items, key=lambda x: len(x['money']), reverse=True)

        self.render('page_template.html', items=items, cached=False)
Example 7
    def post(self):
        CRAWL_CONFIG = DEFAULT_CRAWL_CONFIG.copy()
        CRAWL_CONFIG.update(
            {
                "collections_path": os.path.join(os.path.expanduser("~"), "sky_view_collections/"),
                # 'max_workers': 10,
            }
        )
        args = self.request.arguments
        print(args)
        for arg in args:
            value = args[arg][0].decode("utf8")
            if value and arg != "url" and arg != "checkboxcache":
                print("pre", arg, CRAWL_CONFIG[arg])
                if isinstance(CRAWL_CONFIG[arg], list):
                    CRAWL_CONFIG[arg] = [int(value)] if is_numeric(value) else value.split(", ")
                else:
                    CRAWL_CONFIG[arg] = int(value) if is_numeric(value) else value.split(", ")[0]
                print("post", arg, CRAWL_CONFIG[arg])

        url = self.get_argument("url", "")

        use_cache = self.get_argument("checkboxcache", "")

        domain = extractDomain(url)
        CRAWL_CONFIG["seed_urls"] = [url]
        CRAWL_CONFIG["collection_name"] = domain[7:]

        if use_cache != "on":

            col_path = os.path.join(CRAWL_CONFIG["collections_path"], CRAWL_CONFIG["collection_name"])
            print(col_path)
            if os.path.exists(col_path):
                shutil.rmtree(col_path)

            crawl.start(CRAWL_CONFIG)

        SCRAPE_CONFIG = CRAWL_CONFIG.copy()

        SCRAPE_CONFIG.update({"template_proportion": 0.4, "max_templates": 100})

        skindex = Scraper(SCRAPE_CONFIG)

        skindex.load_local_pages()
        skindex.add_template_elements()

        res = skindex.process_all(remove_visuals=True, maxn=CRAWL_CONFIG["max_saved_responses"])

        items = []
        for num, url in enumerate(res):
            if num == CRAWL_CONFIG["max_saved_responses"]:
                break
            dc = res[url]
            dc["url"] = url
            dc["source_name"] = domain
            dc["images"] = [x for x in reversed(dc["images"][:5])]
            # dc['blobs'] = [TextBlob(x) for x in dc['body'] if dc['body']]
            items.append(dc)

        # this is quite out of place like this
        print("num unique images", len(get_image_set({x["url"]: x for x in items})))

        if items and "money" in items[0]:
            items = sorted(items, key=lambda x: len(x["money"]), reverse=True)

        self.render("page_template.html", items=items, cached=False)
Example 8
    def post(self):
        CRAWL_CONFIG = DEFAULT_CRAWL_CONFIG.copy()
        CRAWL_CONFIG.update({
            'collections_path': os.path.join(os.path.expanduser('~'), 'sky_view_collections/'),
            # 'max_workers': 10,
        })
        args = self.request.arguments
        print(args)
        for arg in args:
            value = args[arg][0].decode('utf8')
            if value and arg != 'url' and arg != 'checkboxcache':
                print('pre', arg, CRAWL_CONFIG[arg])
                if isinstance(CRAWL_CONFIG[arg], list):
                    CRAWL_CONFIG[arg] = [int(value)] if is_numeric(value) else value.split(', ')
                else:
                    CRAWL_CONFIG[arg] = int(value) if is_numeric(value) else value.split(', ')[0]
                print('post', arg, CRAWL_CONFIG[arg])

        url = self.get_argument('url', '')

        use_cache = self.get_argument('checkboxcache', '')

        domain = extractDomain(url)
        CRAWL_CONFIG['seed_urls'] = [url]
        if domain.startswith("http"):
            CRAWL_CONFIG['collection_name'] = domain.split("/")[2]
        else:
            CRAWL_CONFIG['collection_name'] = domain.split("/")[0]

        if use_cache != 'on':

            col_path = os.path.join(CRAWL_CONFIG['collections_path'],
                                    CRAWL_CONFIG['collection_name'])
            print(col_path)
            if os.path.exists(col_path):
                shutil.rmtree(col_path)

            crawl.start(CRAWL_CONFIG)

        SCRAPE_CONFIG = CRAWL_CONFIG.copy()

        SCRAPE_CONFIG.update({
            'template_proportion': 0.4,
            'max_templates': 100,
        })

        skindex = Scraper(SCRAPE_CONFIG)

        skindex.load_local_pages()
        skindex.add_template_elements()

        res = skindex.process_all(remove_visuals=True,
                                  maxn=CRAWL_CONFIG['max_saved_responses'])

        items = []
        for num, url in enumerate(res):
            if num == CRAWL_CONFIG['max_saved_responses']:
                break
            dc = res[url]
            dc['url'] = url
            dc['source_name'] = domain
            dc['images'] = [x for x in reversed(dc['images'][:5])]
            # dc['blobs'] = [TextBlob(x) for x in dc['body'] if dc['body']]
            items.append(dc)

        # this is quite out of place like this
        print('num unique images', len(get_image_set({x['url']: x for x in items})))

        if items and 'money' in items[0]:
            items = sorted(items, key=lambda x: len(x['money']), reverse=True)

        self.render('page_template.html', items=items, cached=False)
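
The three handler versions above also lean on an is_numeric helper that is not shown in these excerpts. Purely as a stand-in to illustrate the behaviour the calling code expects (the value is converted with int(value) when the check passes), it could look like this; the helper actually shipped with the project may differ:

def is_numeric(value):
    # stand-in only: True when the string parses as an integer
    try:
        int(value)
        return True
    except (TypeError, ValueError):
        return False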
Example 9
    def __init__(self, config, cache=None):
        super(NewsCrawler, self).__init__(config, cache)
        self.scraper = Scraper(config)
        self.template_complete = False
        self.data = {}
        self.templates_done = 0