class ScoreSpider(Spider):
    name = 'score'

    def __init__(self, *args, **kwargs):
        super(ScoreSpider, self).__init__(*args, **kwargs)
        self.contentprocessor = ContentProcessor(skip_text=False)
        self.job_config = {'disabled': True}
        self.classifier = None

    def set_process_info(self, process_info):
        self.process_info = process_info
        self.zk.set(process_info)

    def configure(self, job_config):
        self.job_config = job_config
        if 'disabled' not in job_config:
            self.classifier = TopicClassifier.from_keywords(
                job_config['included'], job_config['excluded'])

    # stable branch
    def set_crawler(self, crawler):
        super(ScoreSpider, self).set_crawler(crawler)
        self.crawler.signals.connect(self.spider_idle,
                                     signal=signals.spider_idle)
        self.zk = ZookeeperSession(self.settings.get('ZOOKEEPER_LOCATION'),
                                   name_prefix='spider')
        self.jsonrpc_server = TopicalSpiderWebService(self, self.settings)
        self.jsonrpc_server.start_listening()

    def spider_idle(self):
        self.log("Spider idle signal caught.")
        raise DontCloseSpider

    def make_requests_from_url(self, url):
        r = super(ScoreSpider, self).make_requests_from_url(url)
        return r

    def parse(self, response):
        pc = self.contentprocessor.process_response(response)
        if not pc:
            return
        if self.classifier:
            response.meta['p_score'] = self.classifier.score_paragraphs(
                pc.paragraphs)
            response.meta['title'] = pc.title
            response.meta['descr'] = pc.meta_description
            response.meta['keywords'] = pc.meta_keywords
        for link in pc.links:
            r = Request(url=link.url)
            r.meta.update(link_text=link.text)
            yield r
class ScoreSpider(Spider):
    name = 'score'

    def __init__(self, *args, **kwargs):
        super(ScoreSpider, self).__init__(*args, **kwargs)
        self.contentprocessor = ContentProcessor(skip_text=False)
        self.job_config = {'disabled': True}
        self.classifier = None

    def set_process_info(self, process_info):
        self.process_info = process_info
        self.zk.set(process_info)

    def configure(self, job_config):
        self.job_config = job_config
        if 'disabled' not in job_config:
            self.classifier = TopicClassifier.from_keywords(job_config['included'], job_config['excluded'])

    # stable branch
    def set_crawler(self, crawler):
        super(ScoreSpider, self).set_crawler(crawler)
        self.crawler.signals.connect(self.spider_idle, signal=signals.spider_idle)
        self.zk = ZookeeperSession(self.settings.get('ZOOKEEPER_LOCATION'), name_prefix='spider')
        self.jsonrpc_server = TopicalSpiderWebService(self, self.settings)
        self.jsonrpc_server.start_listening()


    def spider_idle(self):
        self.log("Spider idle signal caught.")
        raise DontCloseSpider

    def make_requests_from_url(self, url):
        r = super(ScoreSpider, self).make_requests_from_url(url)
        return r

    def parse(self, response):
        pc = self.contentprocessor.process_response(response)
        if not pc:
            return
        if self.classifier:
            response.meta['p_score'] = self.classifier.score_paragraphs(pc.paragraphs)
            response.meta['title'] = pc.title
            response.meta['descr'] = pc.meta_description
            response.meta['keywords'] = pc.meta_keywords
        for link in pc.links:
            r = Request(url=link.url)
            r.meta.update(link_text=link.text)
            yield r
 def __init__(self, *args, **kwargs):
     super(ScoreSpider, self).__init__(*args, **kwargs)
     self.contentprocessor = ContentProcessor(skip_text=False)
     self.job_config = {'disabled': True}
     self.classifier = None
 def __init__(self, *args, **kwargs):
     super(ScoreSpider, self).__init__(*args, **kwargs)
     self.contentprocessor = ContentProcessor(skip_text=False)
     self.job_config = {'disabled': True}
     self.classifier = None