Ejemplo n.º 1
0
class ScoreSpider(Spider):
    """Spider that scores crawled pages against a keyword-based topic.

    Each response is run through a ``ContentProcessor``; its paragraphs are
    scored by a ``TopicClassifier`` built in :meth:`configure`. Pages with a
    positive score are reported through the callback registered with
    :meth:`set_result_callback`, and a new request is yielded for every link
    the content processor extracted.
    """
    name = 'score'

    def __init__(self, *args, **kwargs):
        """Create the spider in an unconfigured, disabled state."""
        super(ScoreSpider, self).__init__(*args, **kwargs)
        self.contentprocessor = ContentProcessor(skip_text=False)
        # Placeholder config until configure() is called.
        self.job_config = {'disabled': True}
        self.classifier = None
        self.result_cb = None

    def set_process_info(self, process_info):
        """Store ``process_info`` on the spider."""
        self.process_info = process_info

    def set_result_callback(self, func):
        """Register ``func``; it is called with a dict per matching page."""
        self.result_cb = func

    def configure(self, job_config):
        """Store ``job_config`` and build the classifier unless disabled.

        NOTE: only the *presence* of the ``'disabled'`` key is checked, not
        its value. When the key is absent, ``job_config`` must provide
        ``'included'`` and ``'excluded'`` (a missing key raises ``KeyError``).
        """
        self.job_config = job_config
        if 'disabled' not in job_config:
            self.classifier = TopicClassifier.from_keywords(
                job_config['included'], job_config['excluded'])

    def spider_idle(self):
        """Idle-signal handler: log and raise ``DontCloseSpider``.

        Scrapy treats ``DontCloseSpider`` raised from a ``spider_idle``
        handler as a request to keep the spider open.
        """
        self.log("Spider idle signal caught.")
        raise DontCloseSpider

    @classmethod
    def from_crawler(cls, crawler, *args, **kwargs):
        """Build the spider, bind it to ``crawler`` and hook the idle signal."""
        spider = cls(*args, **kwargs)
        spider._set_crawler(crawler)
        spider.crawler.signals.connect(spider.spider_idle,
                                       signal=signals.spider_idle)
        return spider

    def make_requests_from_url(self, url):
        """Create the base-class request for ``url`` and attach its score."""
        r = super(ScoreSpider, self).make_requests_from_url(url)
        r.meta['score'] = self.get_score(r)
        return r

    def get_score(self, r):
        """Return a URL-based score for request ``r``.

        The score is the reciprocal of the number of ``/``-separated path
        parts plus 5% of the URL length, so shorter and shallower URLs
        score higher.
        """
        url_parts = urlparse_cached(r)
        path_parts = url_parts.path.split('/')
        return 1.0 / (len(path_parts) + 0.05 * len(r.url))

    def parse(self, response):
        """Score ``response``, report it if relevant, and follow its links.

        Yields one ``Request`` per extracted link, each carrying the link
        text and a URL score in its ``meta``. Yields nothing when the
        content processor returns a falsy result.

        Raises:
            Exception: if no classifier has been configured.
        """
        pc = self.contentprocessor.process_response(response)
        if not pc:
            return
        if not self.classifier:
            raise Exception("Classifier not configured")

        response.meta['p_score'] = self.classifier.score_paragraphs(
            pc.paragraphs)
        response.meta['title'] = pc.title
        response.meta['descr'] = pc.meta_description
        response.meta['keywords'] = pc.meta_keywords

        if response.meta['p_score'] > 0:
            if self.result_cb is not None:
                self.result_cb({
                    'score': response.meta['p_score'],
                    'url': response.url,
                    'title': response.meta['title'],
                    'descr': response.meta['descr'],
                    'keywords': response.meta['keywords']
                })
            else:
                # result_cb defaults to None; calling it would raise a
                # TypeError and stop the links below from being followed.
                self.log("No result callback set; dropping result for %s"
                         % response.url)

        for link in pc.links:
            r = Request(url=link.url)
            r.meta.update(link_text=link.text)
            r.meta['score'] = self.get_score(r)
            yield r
Ejemplo n.º 2
0
class ScoreSpider(Spider):
    """Topic-scoring spider.

    Responses are processed by a ``ContentProcessor`` and their paragraphs
    scored by a ``TopicClassifier`` created in :meth:`configure`. Positive
    scores are passed to the registered result callback, and every link found
    on the page is turned into a new request.
    """
    name = 'score'

    def __init__(self, *args, **kwargs):
        """Initialise the spider as disabled, with no classifier or callback."""
        super(ScoreSpider, self).__init__(*args, **kwargs)
        self.contentprocessor = ContentProcessor(skip_text=False)
        # Stays in place until configure() supplies a real job config.
        self.job_config = {'disabled': True}
        self.classifier = None
        self.result_cb = None

    def set_process_info(self, process_info):
        """Remember ``process_info`` on the instance."""
        self.process_info = process_info

    def set_result_callback(self, func):
        """Set the callable that receives a result dict for each matching page."""
        self.result_cb = func

    def configure(self, job_config):
        """Store ``job_config``; build the classifier when it has no 'disabled' key.

        NOTE: the value of ``'disabled'`` is never inspected, only whether the
        key exists. Without that key, ``'included'`` and ``'excluded'`` are
        required (``KeyError`` otherwise).
        """
        self.job_config = job_config
        if 'disabled' not in job_config:
            self.classifier = TopicClassifier.from_keywords(job_config['included'], job_config['excluded'])

    def spider_idle(self):
        """Handle the idle signal by logging and raising ``DontCloseSpider``.

        Raising ``DontCloseSpider`` from a ``spider_idle`` handler is Scrapy's
        documented way of preventing the spider from being closed.
        """
        self.log("Spider idle signal caught.")
        raise DontCloseSpider

    @classmethod
    def from_crawler(cls, crawler, *args, **kwargs):
        """Instantiate the spider, attach ``crawler`` and connect ``spider_idle``."""
        spider = cls(*args, **kwargs)
        spider._set_crawler(crawler)
        spider.crawler.signals.connect(spider.spider_idle, signal=signals.spider_idle)
        return spider

    def make_requests_from_url(self, url):
        """Return the base-class request for ``url`` with ``meta['score']`` set."""
        r = super(ScoreSpider, self).make_requests_from_url(url)
        r.meta['score'] = self.get_score(r)
        return r

    def get_score(self, r):
        """Score request ``r`` from its URL alone.

        Computes ``1 / (path parts + 0.05 * URL length)``, where path parts are
        the ``/``-separated pieces of the URL path; longer or deeper URLs get
        lower scores.
        """
        url_parts = urlparse_cached(r)
        path_parts = url_parts.path.split('/')
        return 1.0 / (len(path_parts) + 0.05*len(r.url))

    def parse(self, response):
        """Classify ``response``, report positive scores and yield link requests.

        Nothing is yielded when the content processor returns a falsy value.
        Each yielded ``Request`` carries ``link_text`` and ``score`` in its meta.

        Raises:
            Exception: when :meth:`configure` has not produced a classifier.
        """
        pc = self.contentprocessor.process_response(response)
        if not pc:
            return
        if not self.classifier:
            raise Exception("Classifier not configured")

        response.meta['p_score'] = self.classifier.score_paragraphs(pc.paragraphs)
        response.meta['title'] = pc.title
        response.meta['descr'] = pc.meta_description
        response.meta['keywords'] = pc.meta_keywords

        if response.meta['p_score'] > 0:
            if self.result_cb is not None:
                self.result_cb({
                    'score': response.meta['p_score'],
                    'url': response.url,
                    'title': response.meta['title'],
                    'descr': response.meta['descr'],
                    'keywords': response.meta['keywords']
                })
            else:
                # The callback is None until set_result_callback() is called;
                # invoking it would raise TypeError and abort link following.
                self.log("No result callback set; dropping result for %s" % response.url)

        for link in pc.links:
            r = Request(url=link.url)
            r.meta.update(link_text=link.text)
            r.meta['score'] = self.get_score(r)
            yield r
Ejemplo n.º 3
0
 def __init__(self, *args, **kwargs):
     """Set up the spider with a content processor and a disabled job config."""
     super(ScoreSpider, self).__init__(*args, **kwargs)
     # Neither a classifier nor a result callback exists yet.
     self.classifier = self.result_cb = None
     # Default job configuration marks the spider as disabled.
     self.job_config = dict(disabled=True)
     self.contentprocessor = ContentProcessor(skip_text=False)
Ejemplo n.º 4
0
 def __init__(self, *args, **kwargs):
     """Initialise base spider state, then the scoring-related attributes."""
     super(ScoreSpider, self).__init__(*args, **kwargs)
     # Classifier and result callback start out unset.
     self.result_cb = None
     self.classifier = None
     # The initial job config carries only the 'disabled' flag.
     initial_config = {'disabled': True}
     self.job_config = initial_config
     processor = ContentProcessor(skip_text=False)
     self.contentprocessor = processor