import datetime
import re
import traceback

import chardet
import lxml.html
import requests

# Task, PageInfo, the *Message classes, the storage clients and TextUtils are
# assumed to come from the surrounding project; they are not defined in these
# excerpts.


class Indexer(Task):
    def __init__(self, splitter):
        super(Indexer, self).__init__()
        self.page_storage = PageStorageClient()
        self.index_storage = IndexStorageClient()
        self.splitter = splitter


    def __call__(self, page_info):
        self.put_message(MakeIndexMessage(page_info))


    def handle_MakeIndexMessage(self, page_info):
        self.show_timestamped_message("Indexing ... %s" % page_info.url)

        page_info.status |= PageInfo.INDEXED
        self.page_storage.set_page(page_info)

        index = [
            ("".join(ngram), (page_info.id, pos))
            for (pos, ngram) in enumerate(self.splitter(page_info.text))]

        self.index_storage.set_index(index)
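

# Sketch, not part of the original code: a minimal character-bigram splitter
# that could be passed to Indexer. Indexer only needs the splitter to yield
# character n-grams in document order; it joins each one with "".join(ngram)
# and records its position via enumerate().
def bigram_splitter(text):
    """Yield overlapping character bigrams from the given text."""
    for pos in range(len(text) - 1):
        yield text[pos:pos + 2]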
class HTTPCrawler(Task):
    AcceptContentTypePatterns = re.compile("xhtml|html|xml")
    DaysOfFetchInterval = 7

    def __init__(self, parser_task):
        super(HTTPCrawler, self).__init__()
        self.parser_task = parser_task
        self.page_storage = PageStorageClient()

    def is_page_status_allow_to_fetch(self, target_url):
        if self.page_storage.has_key(target_url):
            page_info = self.page_storage.get_page(target_url)

            if page_info.status & page_info.IGNORED:
                return False

            else:
                diff = (datetime.datetime.now() - page_info.last_update_timestamp).total_seconds()
                return diff >= (86400 * self.DaysOfFetchInterval)

        else:
            return True


    def is_contents_status_allow_to_fetch(self, target_url):
        response = requests.head(target_url)

        # 301: Moved Permanently
        # 302: Found
        # 303: See Other
        # 307: Temporary Redirect
        # 308: Permanent Redirect
        if response.status_code in (301, 302, 303, 307, 308):
            moved_url = response.headers.get("location")
            if moved_url:
                self.put_message(CrawlURLMessage(moved_url))
            return False

        elif response.status_code == 200:
            content_type = response.headers.get("content-type")
            if content_type is not None:
                return self.AcceptContentTypePatterns.search(content_type) is not None
            return False

        else:
            return False


    def make_correct_unicode_contents(self, byte_contents):
        guessed_encoding = chardet.detect(byte_contents)["encoding"]
        return byte_contents.decode(guessed_encoding)


    def do_fetch(self, target_url):
        try:
            response = requests.get(target_url)

        except Exception:
            self.show_timestamped_message("Error in '%s'\n%s" % (
                self.name, traceback.format_exc()))
            return PageInfo(url=target_url,
                            status=PageInfo.FETCH_ERROR,
                            raw_contents=None)

        else:
            return PageInfo(url=response.url,
                            status=PageInfo.STORED,
                            raw_contents=self.make_correct_unicode_contents(response.content))


    def handle_CrawlURLMessage(self, target_url):
        self.show_timestamped_message("Crawling ... %s" % target_url)

        if self.is_page_status_allow_to_fetch(target_url):
            if self.is_contents_status_allow_to_fetch(target_url):
                page_info = self.do_fetch(target_url)

            else:
                page_info = PageInfo(url=target_url,
                                     status=PageInfo.IGNORED,
                                     raw_contents=None)

            self.page_storage.set_page(page_info)
            self.show_timestamped_message("Stored : %s" % target_url)
            self.parser_task(page_info)

        else:
            self.show_timestamped_message("Ignored : %s" % target_url)
            page_info = self.page_storage.get_page(target_url)
            self.parser_task(page_info)
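

# Quick sanity check (illustrative, not part of the original code) of the
# corrected AcceptContentTypePatterns: it should accept typical HTML/XHTML/XML
# content types and reject everything else.
assert HTTPCrawler.AcceptContentTypePatterns.search("text/html; charset=utf-8")
assert HTTPCrawler.AcceptContentTypePatterns.search("application/xhtml+xml")
assert HTTPCrawler.AcceptContentTypePatterns.search("image/png") is None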
class HTMLParser(Task):
    IgnoreTagNames = ("script", "noscript", "object", "applet")

    def __init__(self, indexer, link_queue=None):
        super(HTMLParser, self).__init__()
        self.page_storage_client = PageStorageClient()
        self.indexer = indexer
        self.link_queue = link_queue


    def __call__(self, page_info):
        self.put_message(ParseHTMLMessage(page_info))


    def remove_xml_declaration(self, text):
        return re.sub(r"^\<\?xml\ .*\?\>\n", "", text)


    def gather_white_spaces(self, text):
        return re.sub(r"\s+", " ", text).strip()


    def collect_text(self, document_tree):
        result = list()

        for element in document_tree.iter():
            if isinstance(element.tag, str) and (element.tag not in self.IgnoreTagNames):
                if element.text is not None:
                    text = self.gather_white_spaces(element.text)
                    if text != "":
                        result.append(text)

        return " ".join(result)


    def extract_link_urls(self, document_tree):
        return set([e.attrib["href"] for e in document_tree.xpath("//a[@href]")])


    def handle_ParseHTMLMessage(self, page_info):
        self.show_timestamped_message("Parsing ... %s" % page_info.url)

        contents = self.remove_xml_declaration(page_info.raw_contents)
        document_tree = lxml.html.fromstring(contents, base_url=page_info.url)
        document_tree.make_links_absolute()

        if not (page_info.status & PageInfo.PARSED):
            title_elements = document_tree.xpath("//title")
            if title_elements:
                page_info.title = title_elements[0].text.strip()
            else:
                page_info.title = None

            page_info.text = TextUtils.normalize_text(self.collect_text(document_tree))
            self.page_storage_client.set_page(page_info)

        self.indexer(page_info)

        if self.link_queue:
            for link_url in self.extract_link_urls(document_tree):
                self.link_queue(link_url)
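

# Sketch of how these tasks might be wired together; none of this appears in
# the original excerpts. It assumes the Task base class dispatches each queued
# message to the matching handle_<MessageClass> method (as the handlers above
# suggest) and that running the task loop is provided by Task itself.
if __name__ == "__main__":
    indexer = Indexer(splitter=bigram_splitter)

    # HTTPCrawler and HTMLParser reference each other, so the crawler is
    # created first and its parser_task is attached afterwards.
    crawler = HTTPCrawler(parser_task=None)
    parser = HTMLParser(
        indexer=indexer,
        link_queue=lambda url: crawler.put_message(CrawlURLMessage(url)))
    crawler.parser_task = parser

    # Seed the crawl with a placeholder URL; HTTPCrawler picks it up in
    # handle_CrawlURLMessage.
    crawler.put_message(CrawlURLMessage("http://example.com/"))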