Ejemplo n.º 1
0
class CrawlStrategy(object):
    """Crawling strategy that scores discovered links by the shape of their URL path.

    Every hook returns a dict mapping a request fingerprint to a score, and
    stores a state id (looked up through ``_state.get_id``) under
    ``meta['state']`` of the objects it handles.
    """

    def __init__(self):
        # The solver turns a request/response into a 3-tuple whose first two
        # items are used here as the URL and the fingerprint.
        self.canonicalsolver = BasicCanonicalSolver()

    def add_seeds(self, seeds):
        """Give every seed whose state is unset a score of 1.0 and mark it QUEUED.

        :param seeds: iterable of objects exposing a ``meta`` mapping that
            contains a ``'state'`` key.
        :return: dict ``{fingerprint: 1.0}`` for the seeds whose state was
            ``None``; seeds with any other state are skipped untouched.
        """
        queued = {}
        for seed in seeds:
            if seed.meta['state'] is not None:
                continue
            _, seed_fp, _ = self.canonicalsolver.get_canonical_url(seed)
            queued[seed_fp] = 1.0
            seed.meta['state'] = _state.get_id('QUEUED')
        return queued

    def page_crawled(self, response, links):
        """Mark the response CRAWLED and score the links whose state is unset.

        :param response: object with a ``meta`` mapping; its ``'state'`` entry
            is overwritten with the CRAWLED id.
        :param links: iterable of link objects with a ``meta`` mapping that
            contains a ``'state'`` key.
        :return: dict ``{fingerprint: score}`` for the links whose state was
            ``None``, scored with :meth:`get_score`.
        """
        response.meta['state'] = _state.get_id('CRAWLED')
        link_scores = {}
        for link in links:
            if link.meta['state'] is not None:
                continue
            link_url, link_fp, _ = self.canonicalsolver.get_canonical_url(link)
            link_scores[link_fp] = self.get_score(link_url)
            link.meta['state'] = _state.get_id('QUEUED')
        return link_scores

    def page_error(self, request, error):
        """Mark the failed request as ERROR and report a score of 0.0 for it.

        :param request: object with a ``meta`` mapping.
        :param error: not used by this implementation.
        :return: single-entry dict ``{fingerprint: 0.0}``.
        """
        _, failed_fp, _ = self.canonicalsolver.get_canonical_url(request)
        request.meta['state'] = _state.get_id('ERROR')
        return {failed_fp: 0.0}

    def finished(self):
        """Always return ``False``."""
        return False

    def get_score(self, url):
        """Return a score that shrinks as the URL path gets deeper and longer.

        The score is ``1 / (segments + 0.1 * path_length)``, where ``segments``
        is the number of pieces obtained by splitting the path on ``'/'``.

        :param url: URL string to score.
        :return: float score.
        """
        path = urlparse(url).path
        segment_count = len(path.split('/'))
        return 1.0 / (max(segment_count, 1.0) + len(path) * 0.1)
Ejemplo n.º 2
0
class CrawlStrategy(object):
    """Strategy hooks that assign scores to fingerprints and track request state.

    State ids come from ``_state.get_id`` and are written to the
    ``meta['state']`` entry of seeds, responses, links and failed requests.
    """

    def __init__(self):
        # Used by every hook to obtain the (url, fingerprint, ...) triple.
        self.canonicalsolver = BasicCanonicalSolver()

    def add_seeds(self, seeds):
        """Queue the seeds that have no state yet, each with a score of 1.0.

        :param seeds: iterable of objects whose ``meta`` mapping has a
            ``'state'`` key.
        :return: dict mapping the fingerprint of each newly queued seed to 1.0.
        """
        result = {}
        # Lazy filter on purpose: each state check runs right before its seed
        # is processed, so an object listed twice is only queued once.
        fresh_seeds = (s for s in seeds if s.meta['state'] is None)
        for fresh in fresh_seeds:
            _url, fp, _rest = self.canonicalsolver.get_canonical_url(fresh)
            result[fp] = 1.0
            fresh.meta['state'] = _state.get_id('QUEUED')
        return result

    def page_crawled(self, response, links):
        """Record the response as CRAWLED and queue its not-yet-seen links.

        :param response: object whose ``meta['state']`` is set to the CRAWLED id.
        :param links: iterable of objects whose ``meta`` mapping has a
            ``'state'`` key.
        :return: dict mapping the fingerprint of each newly queued link to the
            value of :meth:`get_score` for its canonical URL.
        """
        result = {}
        response.meta['state'] = _state.get_id('CRAWLED')
        # Lazy filter, evaluated link by link (see add_seeds).
        fresh_links = (ln for ln in links if ln.meta['state'] is None)
        for fresh in fresh_links:
            target, fp, _rest = self.canonicalsolver.get_canonical_url(fresh)
            result[fp] = self.get_score(target)
            fresh.meta['state'] = _state.get_id('QUEUED')
        return result

    def page_error(self, request, error):
        """Flag the request as ERROR and return a zero score for its fingerprint.

        :param request: object with a ``meta`` mapping.
        :param error: ignored here.
        :return: ``{fingerprint: 0.0}``.
        """
        _url, fp, _rest = self.canonicalsolver.get_canonical_url(request)
        request.meta['state'] = _state.get_id('ERROR')
        zero_score = {}
        zero_score[fp] = 0.0
        return zero_score

    def finished(self):
        """Report that the crawl is not finished; this always returns ``False``."""
        return False

    def get_score(self, url):
        """Score a URL from its path: fewer segments and fewer characters score higher.

        :param url: URL string.
        :return: ``1.0 / (number_of_'/'-separated_pieces + 0.1 * len(path))``.
        """
        parsed = urlparse(url)
        depth = max(len(parsed.path.split('/')), 1.0)
        length_penalty = len(parsed.path) * 0.1
        return 1.0 / (depth + length_penalty)
Ejemplo n.º 3
0
 def __init__(self):
     """Attach a freshly built ``BasicCanonicalSolver`` as ``self.canonicalsolver``."""
     solver = BasicCanonicalSolver()
     self.canonicalsolver = solver
Ejemplo n.º 4
0
class CrawlStrategy(object):
    """Crawling strategy that follows links only from pages a classifier accepts.

    Pages carrying a ``'p_score'`` entry in ``response.meta`` are passed to the
    configured classifier. Accepted pages are stored in ``self.results`` and
    their unseen links are queued with a path-based score; links of all other
    pages are marked NOT_CRAWLED. ``configure()`` must be called before pages
    with a ``'p_score'`` are processed, because it creates ``self.classifier``.
    """

    S_QUEUED = _state.get_id('QUEUED')
    S_NOT_CRAWLED = _state.get_id('NOT_CRAWLED')
    S_ERROR = _state.get_id('ERROR')
    S_CRAWLED = _state.get_id('CRAWLED')
    # Default result limit; overridden per instance by configure().
    fetch_limit = 100

    def __init__(self):
        self.canonicalsolver = BasicCanonicalSolver()
        self.content_processor = ContentProcessor()
        # Also creates self.stats, so page_crawled()/page_error() work on an
        # instance that has not been configured yet instead of raising
        # AttributeError.
        self._reset_progress()

    def _reset_progress(self):
        """Clear collected results and zero the counters."""
        self.results = {}
        self.results_collected = 0
        self.stats = {
            'downloaded': 0,
            'errors': 0,
            'scheduled': 0
        }

    @staticmethod
    def _link_score(url):
        """Return 1 / (path pieces + 0.1 * path length) for ``url``."""
        path = urlparse(url).path
        # str.split always yields at least one piece, so the divisor is >= 1.
        return 1.0 / (len(path.split('/')) + len(path) * 0.1)

    def configure(self, config):
        """Build the classifier from ``config`` and reset all progress.

        :param config: mapping with the keys ``'included'`` and ``'excluded'``
            (passed to ``TopicClassifier.from_keywords``) and ``'nResults'``
            (stored as ``fetch_limit``).
        """
        self.classifier = TopicClassifier.from_keywords(config['included'], config['excluded'])
        self.fetch_limit = config['nResults']
        self._reset_progress()

    def add_seeds(self, seeds):
        """Queue every seed whose state is ``None`` with a score of 1.0.

        :param seeds: iterable of objects whose ``meta`` mapping has a
            ``'state'`` key.
        :return: dict ``{fingerprint: 1.0}`` for the newly queued seeds.
        """
        scores = {}
        for seed in seeds:
            if seed.meta['state'] is None:
                _, fingerprint, _ = self.canonicalsolver.get_canonical_url(seed)
                scores[fingerprint] = 1.0
                seed.meta['state'] = self.S_QUEUED
        return scores

    def page_crawled(self, response, links):
        """Record a crawled page and decide what happens to its links.

        :param response: object with a ``meta`` mapping. When it contains
            ``'p_score'`` and the classifier accepts that score, the page is
            stored in ``self.results`` (this also reads ``'title'``,
            ``'descr'`` and ``'keywords'`` from ``meta``).
        :param links: iterable of link objects whose ``meta`` mapping has a
            ``'state'`` key.
        :return: dict mapping the fingerprint of each link whose state was
            ``None`` to a float score (page accepted) or to ``None`` (page not
            accepted).
        """
        scores = {}
        response.meta['state'] = self.S_CRAWLED
        url, fingerprint, _ = self.canonicalsolver.get_canonical_url(response)

        drill_down = False
        if 'p_score' in response.meta:
            score = response.meta['p_score']
            drill_down = self.classifier.classify_paragraphs(score)
            if drill_down:
                self.results[fingerprint] = [
                    score,
                    url,
                    response.meta['title'],
                    response.meta['descr'],
                    response.meta['keywords'],
                ]
                self.results_collected += 1

        scheduled = 0
        for link in links:
            if link.meta['state'] is not None:
                continue
            # Separate names so the response's url/fingerprint are not shadowed.
            link_url, link_fingerprint, _ = self.canonicalsolver.get_canonical_url(link)
            if drill_down:
                scores[link_fingerprint] = self._link_score(link_url)
                link.meta['state'] = self.S_QUEUED
            else:
                scores[link_fingerprint] = None
                link.meta['state'] = self.S_NOT_CRAWLED
            # Counts every unseen link, whether queued or marked NOT_CRAWLED.
            scheduled += 1
        self.stats['downloaded'] += 1
        self.stats['scheduled'] += scheduled
        return scores

    def page_error(self, request, error):
        """Mark the request as ERROR, count the error and return a 0.0 score.

        :param request: object with a ``meta`` mapping.
        :param error: not used by this implementation.
        :return: ``{fingerprint: 0.0}``.
        """
        _, fingerprint, _ = self.canonicalsolver.get_canonical_url(request)
        request.meta['state'] = self.S_ERROR
        self.stats['errors'] += 1
        return {fingerprint: 0.0}

    def finished(self):
        """Return ``True`` once more than ``fetch_limit`` results were collected."""
        # NOTE(review): strict '>' means the crawl stops at fetch_limit + 1
        # results; confirm whether '>=' was intended before changing it.
        return self.results_collected > self.fetch_limit
Ejemplo n.º 5
0
 def __init__(self):
     """Create the helper objects and start with no collected results."""
     # Helpers are built in this order: canonical solver, then content processor.
     self.canonicalsolver = BasicCanonicalSolver()
     self.content_processor = ContentProcessor()
     # Empty result store and a zeroed counter.
     self.results, self.results_collected = {}, 0
Ejemplo n.º 6
0
 def __init__(self):
     """Store a new ``BasicCanonicalSolver`` instance on ``self.canonicalsolver``."""
     canonical_solver = BasicCanonicalSolver()
     self.canonicalsolver = canonical_solver