Example #1
import datetime
import re
import traceback

import chardet
import requests

# Task, PageInfo, PageStorageClient and CrawlURLMessage are assumed to come
# from elsewhere in this project (they are not shown in the snippet).


class HTTPCrawler(Task):
    # Fetch only (X)HTML and XML responses, and re-fetch a known page at most
    # once every DaysOfFetchInterval days.
    AcceptContentTypePatterns = re.compile("xhtml|html|xml")
    DaysOfFetchInterval = 7

    def __init__(self, parser_task):
        super(HTTPCrawler, self).__init__()
        self.parser_task = parser_task
        self.page_storage = PageStorageClient()

    def is_page_status_allow_to_fetch(self, target_url):
        if self.page_storage.has_key(target_url):
            page_info = self.page_storage.get_page(target_url)

            if page_info.status & page_info.IGNORED:
                return False

            else:
                diff = (datetime.datetime.now() - page_info.last_update_timestamp).total_seconds()
                return diff >= (86400 * self.DaysOfFetchInterval)

        else:
            return True


    def is_contents_status_allow_to_fetch(self, target_url):
        response = requests.head(target_url)

        # 301: Moved Permanently
        # 302: Found
        # 303: See Other
        # 307: Temporary Redirect
        # 308: Permanent Redirect
        if response.status_code in (301, 302, 303, 307, 308):
            moved_url = response.headers.get("location")
            if moved_url:
                self.put_message(CrawlURLMessage(moved_url))
                return False

        elif response.status_code == 200:
            content_type = response.headers.get("content-type")
            if content_type is not None:
                return self.AcceptContentTypePatterns.search(content_type) is not None
            # No Content-Type header: treat the page as not fetchable.
            return False

        else:
            return False


    def make_correct_unicode_contents(self, byte_contents):
        guess_encoding = chardet.detect(byte_contents)["encoding"]
        # Fall back to UTF-8 when chardet cannot detect the encoding.
        return byte_contents.decode(guess_encoding or "utf-8")


    def do_fetch(self, target_url):
        try:
            response = requests.get(target_url)

        except Exception:
            self.show_timestamped_message("Error in '%s'\n%s" % (
                self.name, traceback.format_exc()))
            # The request failed, so there is no response object to take the URL from.
            return PageInfo(url=target_url,
                            status=PageInfo.FETCH_ERROR,
                            raw_contents=None)

        else:
            return PageInfo(url=response.url,
                            status=PageInfo.STORED,
                            raw_contents=self.make_correct_unicode_contents(response.content))


    def handle_CrawlerURLMessage(self, target_url):
        self.show_timestamped_message("Crawling ... %s" % target_url)

        if self.is_page_status_allow_to_fetch(target_url):
            if self.is_contents_status_allow_to_fetch(target_url):
                page_info = self.do_fetch(target_url)

            else:
                page_info = PageInfo(url=target_url,
                                     status=PageInfo.IGNORED,
                                     raw_contents=None)

            self.page_storage.set_page(page_info)
            self.show_timestamped_message("Stored : %s" % target_url)
            self.parser_task(page_info)

        else:
            self.show_timestamped_message("Ignored : %s" % target_url)
            page_info = self.page_storage.get_page(target_url)
            self.parser_task(page_info)
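
The crawler above treats PageInfo.status as a bit field (note the status & page_info.IGNORED test) and constructs PageInfo with url, status and raw_contents keywords, but the class itself is not part of this example. A minimal sketch of the shape the code appears to assume; the flag values and the default timestamp are guesses for illustration only:

import datetime

class PageInfo(object):
    # Status bit flags; the names come from the crawler code above, the
    # concrete values are hypothetical.
    STORED = 1
    IGNORED = 2
    FETCH_ERROR = 4

    def __init__(self, url, status, raw_contents):
        self.url = url
        self.status = status
        self.raw_contents = raw_contents
        # last_update_timestamp is read by is_page_status_allow_to_fetch().
        self.last_update_timestamp = datetime.datetime.now()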
Example #2
            if title_elements:
                page_info.title = title_elements[0].text.strip()
            else:
                page_info.title = None

            page_info.text = TextUtils.normalize_text(self.collect_text(document_tree))
            self.page_storage_client.set_page(page_info)

        self.indexer(page_info)

        if self.link_queue:
            for link_url in self.extract_link_urls(document_tree):
                self.link_queue(link_url)



if __name__ == "__main__":
    import sys

    def dummy_indexer(page_info):
        print("dummy_indexer(%s)" % page_info)

    def dummy_queue(url):
        print(url)


    page_storage = PageStorageClient()
    page_info = page_storage.get_page("http://www.xlisting.co.jp/index.html")
    parser = HTMLParser(dummy_indexer, link_queue=dummy_queue)
    parser.handle_ParseHTMLMessage(page_info)
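
Taken together, the two examples suggest a simple pipeline: the crawler calls its parser_task with every stored PageInfo, and the parser hands the page to its indexer and each extracted link to its link_queue. A hedged sketch of wiring them up, assuming both classes are importable from the same project; the wiring itself is an illustration, not part of the original snippets:

def print_indexer(page_info):
    print("indexed: %s" % page_info)

def print_link_queue(url):
    print("discovered: %s" % url)

# Assumed wiring: the parser's handler becomes the crawler's parser_task, so a
# crawled page is fetched, stored, parsed and indexed in one pass.
parser = HTMLParser(print_indexer, link_queue=print_link_queue)
crawler = HTTPCrawler(parser.handle_ParseHTMLMessage)
crawler.handle_CrawlerURLMessage("http://www.xlisting.co.jp/index.html")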