class Indexer(Task):

    def __init__(self, splitter):
        super(Indexer, self).__init__()
        self.page_storage = PageStorageClient()
        self.index_storage = IndexStorageClient()
        self.splitter = splitter

    def __call__(self, page_info):
        self.put_message(MakeIndexMessage(page_info))

    def handle_MakeIndexMessage(self, page_info):
        self.show_timestamped_message("Indexing ... %s" % page_info.url)
        page_info.status |= PageInfo.INDEXED
        self.page_storage.set_page(page_info)
        index = [("".join(ngram), (page_info.id, pos))
                 for (pos, ngram) in enumerate(self.splitter(page_info.text))]
        self.index_storage.set_index(index)
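The `splitter` passed to `Indexer` is expected to yield character n-grams, which the list comprehension above joins back into index terms and pairs with their positions. A minimal sketch of such a splitter, assuming character bigrams are the desired unit (the name `bigram_splitter` and the choice of n=2 are illustrative, not part of the original code):

def bigram_splitter(text, n=2):
    # Yields successive character n-grams, e.g. "abcd" -> "ab", "bc", "cd".
    # enumerate(bigram_splitter(text)) then gives (position, ngram) pairs,
    # matching what handle_MakeIndexMessage expects from self.splitter.
    for i in range(len(text) - n + 1):
        yield text[i:i + n]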
class HTTPCrawler(Task):

    AcceptContentTypePatterns = re.compile("xhtml|html|xml")
    DaysOfFetchInterval = 7

    def __init__(self, parser_task):
        super(HTTPCrawler, self).__init__()
        self.parser_task = parser_task
        self.page_storage = PageStorageClient()

    def is_page_status_allow_to_fetch(self, target_url):
        if self.page_storage.has_key(target_url):
            page_info = self.page_storage.get_page(target_url)
            if page_info.status & PageInfo.IGNORED:
                return False
            else:
                diff = (datetime.datetime.now() -
                        page_info.last_update_timestamp).total_seconds()
                return diff >= (86400 * self.DaysOfFetchInterval)
        else:
            return True

    def is_contents_status_allow_to_fetch(self, target_url):
        response = requests.head(target_url)
        # 301: Moved Permanently
        # 302: Found
        # 303: See Other
        # 307: Temporary Redirect
        # 308: Permanent Redirect
        if response.status_code in (301, 302, 303, 307, 308):
            moved_url = response.headers.get("location")
            if moved_url:
                self.put_message(CrawlURLMessage(moved_url))
            return False
        elif response.status_code == 200:
            content_type = response.headers.get("content-type")
            if content_type is not None:
                return self.AcceptContentTypePatterns.search(content_type) is not None
        return False

    def make_correct_unicode_contents(self, byte_contents):
        guess_encoding = chardet.detect(byte_contents)["encoding"]
        return byte_contents.decode(guess_encoding)

    def do_fetch(self, target_url):
        try:
            response = requests.get(target_url)
        except Exception:
            self.show_timestamped_message("Error in '%s'\n%s" % (
                self.name, traceback.format_exc()))
            return PageInfo(url=target_url, status=PageInfo.FETCH_ERROR,
                            raw_contents=None)
        else:
            return PageInfo(url=response.url, status=PageInfo.STORED,
                            raw_contents=self.make_correct_unicode_contents(response.content))

    def handle_CrawlURLMessage(self, target_url):
        self.show_timestamped_message("Crawling ... %s" % target_url)
        if self.is_page_status_allow_to_fetch(target_url):
            if self.is_contents_status_allow_to_fetch(target_url):
                page_info = self.do_fetch(target_url)
            else:
                page_info = PageInfo(url=target_url, status=PageInfo.IGNORED,
                                     raw_contents=None)
            self.page_storage.set_page(page_info)
            self.show_timestamped_message("Stored : %s" % target_url)
            self.parser_task(page_info)
        else:
            self.show_timestamped_message("Ignored : %s" % target_url)
            page_info = self.page_storage.get_page(target_url)
            self.parser_task(page_info)
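PageInfo itself is not part of this listing; the crawler only assumes it carries a URL, raw contents, a last-update timestamp, and a bit-flag status field that is combined with | and tested with &. A minimal sketch of such a class, with flag values and attribute defaults chosen purely for illustration:

import datetime

class PageInfo(object):
    # Illustrative status bit flags; the real values live wherever PageInfo is defined.
    STORED      = 1 << 0
    PARSED      = 1 << 1
    INDEXED     = 1 << 2
    IGNORED     = 1 << 3
    FETCH_ERROR = 1 << 4

    def __init__(self, url, status, raw_contents):
        self.url = url
        self.status = status
        self.raw_contents = raw_contents
        self.id = None            # presumably assigned by the page storage
        self.title = None
        self.text = None
        self.last_update_timestamp = datetime.datetime.now()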
class HTMLParser(Task):

    IgnoreTagNames = ("script", "noscript", "object", "applet")

    def __init__(self, indexer, link_queue=None):
        super(HTMLParser, self).__init__()
        self.page_storage_client = PageStorageClient()
        self.indexer = indexer
        self.link_queue = link_queue

    def __call__(self, page_info):
        self.put_message(ParseHTMLMessage(page_info))

    def remove_xml_declaration(self, text):
        return re.sub(r"^\<\?xml\ .*\?\>\n", "", text)

    def gather_white_spaces(self, text):
        return re.sub(r"\s+", " ", text).strip()

    def collect_text(self, document_tree):
        result = list()
        for element in document_tree.iter():
            if isinstance(element.tag, str) and (element.tag not in self.IgnoreTagNames):
                if element.text is not None:
                    text = self.gather_white_spaces(element.text)
                    if text != "":
                        result.append(text)
        return " ".join(result)

    def extract_link_urls(self, document_tree):
        return set([e.attrib["href"] for e in document_tree.xpath("//a[@href]")])

    def handle_ParseHTMLMessage(self, page_info):
        self.show_timestamped_message("Parsing ... %s" % page_info.url)
        contents = self.remove_xml_declaration(page_info.raw_contents)
        document_tree = lxml.html.fromstring(contents, base_url=page_info.url)
        document_tree.make_links_absolute()
        if not (page_info.status & PageInfo.PARSED):
            title_elements = document_tree.xpath("//title")
            if title_elements:
                page_info.title = title_elements[0].text.strip()
            else:
                page_info.title = None
            page_info.text = TextUtils.normalize_text(self.collect_text(document_tree))
            self.page_storage_client.set_page(page_info)
            self.indexer(page_info)
        if self.link_queue:
            for link_url in self.extract_link_urls(document_tree):
                self.link_queue(link_url)
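All three classes rely on the Task base class routing each queued message to a handle_<MessageClassName> method (CrawlURLMessage to handle_CrawlURLMessage, and so on); Task and the message classes are not shown in this listing. A minimal single-threaded sketch of that dispatch convention, assuming each message simply wraps one payload object (the real implementation presumably runs each Task as its own worker with a message queue):

import datetime

class Message(object):
    # Illustrative: each message wraps a single payload.
    def __init__(self, payload):
        self.payload = payload

class CrawlURLMessage(Message): pass
class ParseHTMLMessage(Message): pass
class MakeIndexMessage(Message): pass

class Task(object):
    def __init__(self):
        self.name = self.__class__.__name__

    def show_timestamped_message(self, text):
        print("[%s] %s: %s" % (datetime.datetime.now().isoformat(), self.name, text))

    def put_message(self, message):
        # Dispatch by message class name, e.g. MakeIndexMessage -> handle_MakeIndexMessage,
        # passing the wrapped payload to the handler.
        handler = getattr(self, "handle_%s" % message.__class__.__name__)
        handler(message.payload)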
if __name__ == "__main__":

    def dummy_indexer(page_info):
        print("dummy_indexer(%s)" % page_info)

    def dummy_queue(url):
        print(url)

    page_storage = PageStorageClient()
    page_info = page_storage.get_page("http://www.xlisting.co.jp/index.html")
    parser = HTMLParser(dummy_indexer, link_queue=dummy_queue)
    parser.handle_ParseHTMLMessage(page_info)
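Beyond the single-page test above, the three tasks could be wired into one crawl loop roughly as follows. This is only a sketch under the assumptions already made here: it uses the synchronous Task and bigram_splitter sketched earlier, a plain list as the link queue, a placeholder seed URL, and an arbitrary 100-page bound; it also assumes PageStorageClient and IndexStorageClient are available as in the original listing.

pending_urls = []
indexer = Indexer(bigram_splitter)
parser = HTMLParser(indexer, link_queue=pending_urls.append)  # discovered links go back on the list
crawler = HTTPCrawler(parser)

pending_urls.append("http://example.com/")   # placeholder seed URL
seen = set()
while pending_urls and len(seen) < 100:       # bound the sketch to 100 pages
    url = pending_urls.pop(0)
    if url in seen:
        continue
    seen.add(url)
    crawler.put_message(CrawlURLMessage(url))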