Ejemplo n.º 1
0
    def parse_item(self, response):
        '''
        Handle the response of a leaf (item) page.

        Reads the source name from the request meta key 'domain', passes the
        request URL through the optional source link processor, and delegates
        status reporting to _process_response with LinkType.LEAF.

        If _process_response returns a falsy value, a FAIL status is reported
        for the originally requested URL, the item is removed through
        career.remove_item, and the method returns. Otherwise, when an item
        loader class is configured, a loader is built over the response and
        seeded with the 'source' and 'source_link' values. Any exception raised
        while doing so is reported as a FAIL status with LinkType.UNKNOWN,
        printed and passed to log_error.

        The visible code returns None on every path.
        '''
        meta = response.request.meta
        source = meta['domain']
        url = response.request.url
        if self.sourcelinkprocessor_class:
            processor = self.sourcelinkprocessor_class()
            url = processor.process(url)

        if not self._process_response(response, source, LinkType.LEAF):
            # 'redirect_urls' may be absent from meta (Scrapy's redirect
            # middleware only records it once a redirect has happened), so
            # fall back to the request URL instead of raising KeyError.
            redirect_urls = meta.get('redirect_urls')
            if redirect_urls:
                failed_url = redirect_urls[0]
            else:
                failed_url = response.request.url
            # Report with the leaf link type; the bare name `type` here would
            # resolve to the builtin, not to a LinkType value.
            service.report_status([LinkStatus(failed_url, source, Status.FAIL, LinkType.LEAF)])
            career.remove_item(url, source)
            return

        if not self.itemloader_class:
            return

        try:
            selector = HtmlXPathSelector(response)
            loader = self.itemloader_class(selector)
            loader.add_value('source', source)
            loader.add_value('source_link', url)
        except Exception as e:
            # Best-effort: a loader failure is reported and logged, not raised.
            service.report_status([LinkStatus(url, source, Status.FAIL, LinkType.UNKNOWN)])
            # Single-argument form prints the same text on Python 2 and 3.
            print('%s %s' % (e, url))
            log_error(url)
Ejemplo n.º 2
0
    def _process_response(self, response, source, type):
        '''
        Report the status of a fetched link and decide whether to go on.

        The request URL is first passed through the optional source filter
        processor (called with this object and the URL), then through the
        optional source link processor (called with the URL only). If either
        processor returns None, a FAIL status is reported and False is
        returned. Otherwise SUCCEED is reported when the HTTP status is 200
        and FAIL for any other status.

        response -- the response being handled; its request URL and status
                    are read.
        source   -- source identifier forwarded to every LinkStatus.
        type     -- link type forwarded to every LinkStatus.

        Returns True if response can be further processed otherwise False.
        '''
        url = response.request.url
        # Last non-None form of the URL. A rejection is reported against this
        # value so the status names the link that failed rather than None.
        reportable_url = url

        if self.sourcefilterprocessor_class:
            processor = self.sourcefilterprocessor_class()
            url = processor.process(self, url)

        if url is None:
            service.report_status([LinkStatus(reportable_url, source, Status.FAIL, type)])
            return False
        reportable_url = url

        if self.sourcelinkprocessor_class:
            processor = self.sourcelinkprocessor_class()
            url = processor.process(url)

        if url is None:
            service.report_status([LinkStatus(reportable_url, source, Status.FAIL, type)])
            return False

        # Only an HTTP 200 counts as success; everything else is a failure.
        succeeded = response.status == 200
        if succeeded:
            outcome = Status.SUCCEED
        else:
            outcome = Status.FAIL
        service.report_status([LinkStatus(url, source, outcome, type)])
        return succeeded
Ejemplo n.º 3
0
    def parse(self, response):
        '''
        Handle the response of a catalog page and report the links found on it.

        Reads the source name from the request meta key 'domain' and delegates
        status reporting to _process_response with LinkType.CATELOG; when that
        returns a falsy value nothing else is done. Otherwise the rules built
        by _get_rules from the meta key 'rules' are applied in order: each
        rule's link extractor is run on the response and, if the rule has a
        process_links callable and links were extracted, the links are passed
        through it. Links are de-duplicated across rules, keeping the first
        occurrence.

        When at least one link was collected, one FOUND status per link is
        reported, each carrying the link type of the rule that first produced
        it, followed by a SUCCEED status for the page itself that includes the
        number of links.

        The visible code returns None on every path.
        '''
        meta = response.request.meta
        source = meta['domain']
        url = response.request.url
        if not self._process_response(response, source, LinkType.CATELOG):
            return

        rule_dicts = meta['rules']
        rules = self._get_rules(source, rule_dicts)

        all_link = []
        # Parallel to all_link: link type of the rule that first yielded each
        # link. Recorded inside the loop because `rule` only refers to the last
        # rule once the loop has finished.
        link_types = []
        for rule in rules:
            links = list(rule.link_extractor.extract_links(response))
            if links and rule.process_links:
                links = rule.process_links(links)

            for link in links:
                if link not in all_link:
                    all_link.append(link)
                    link_types.append(rule.link_type)

        if all_link:
            service.report_status([
                LinkStatus(link.url, source, Status.FOUND, link_type)
                for link, link_type in zip(all_link, link_types)
            ])
            service.report_status([LinkStatus(url, source, Status.SUCCEED, LinkType.CATELOG, len(all_link))])