def __init__(self, html, base_url=''):
    """Parse ``html`` and run the extraction steps on the parsed document.

    In order, the constructor: wraps three instance methods in
    per-instance LRU caches, resets the scoring state, parses ``html``
    with ``BS``, records the document title, and then calls ``purge``,
    ``find_main_content`` and ``relative_path2_abs_url``.

    :param html: markup handed to ``BS``.
    :param base_url: stored as ``self.base_url``; presumably consumed by
        ``relative_path2_abs_url`` -- confirm against that method.
    """
    # Cache per instance rather than per class, see
    # http://stackoverflow.com/questions/14946264/python-lru-cache-decorator-per-instance
    cached_method_names = (
        'calc_img_area_len',
        'calc_effective_text_len',
        'parents_of_article_header',
    )
    for method_name in cached_method_names:
        bound_method = getattr(self, method_name)
        setattr(self, method_name, lru_cache(1024)(bound_method))

    self.max_score = -1
    # Original author's warning: a dict identifies keys through __eq__,
    # and BS may consider two different nodes equal, so do NOT key a dict
    # on the nodes themselves.
    self.scores = defaultdict(int)

    parsed = BS(html)

    # Fall back to an empty unicode string both when there is no <title>
    # and when the title has no usable string.
    title_tag = parsed.title
    if title_tag:
        title_text = title_tag.string
    else:
        title_text = u''
    self.title = title_text or u''

    self.article = parsed
    self.base_url = base_url

    self.purge(parsed)
    self.find_main_content(parsed)

    # Clean-ups; the clean_up_html() step is currently disabled.
    # self.clean_up_html()
    self.relative_path2_abs_url()
def __init__(self, html, url=''):
    """Parse ``html`` and run the extraction steps on ``self.doc``.

    In order, the constructor: wraps ``calc_img_area_len`` in a
    per-instance LRU cache, resets the scoring state, parses ``html``
    with ``BS`` into ``self.doc``, records the document title, and then
    calls ``get_favicon_url``, ``purge``, ``find_main_content`` and
    ``relative_path2_abs_url``.

    :param html: markup handed to ``BS``.
    :param url: stored as ``self.url``; presumably the page address used
        to resolve relative links -- confirm against the helper methods.
    """
    # Cache per instance rather than per class, see
    # http://stackoverflow.com/questions/14946264/python-lru-cache-decorator-per-instance
    memoize = lru_cache(1024)
    self.calc_img_area_len = memoize(self.calc_img_area_len)
    # calc_effective_text_len is left uncached; its wrapper is disabled:
    # self.calc_effective_text_len = lru_cache(1024)(self.calc_effective_text_len)

    self.max_score = -1
    # Original author's warning: a dict identifies keys through __eq__,
    # and BS may consider two different nodes equal, so do NOT key a dict
    # on the nodes themselves.
    self.scores = defaultdict(int)

    self.doc = BS(html)

    # Fall back to an empty unicode string both when there is no <title>
    # and when the title has no usable string.
    title_tag = self.doc.title
    if title_tag:
        title_text = title_tag.string
    else:
        title_text = u''
    self.title = title_text or u''

    self.article = Null
    self.url = url

    # The favicon lookup has to happen before purge().
    self.get_favicon_url()
    self.purge()
    self.find_main_content()

    # Clean-ups; the clean_up_html() step is currently disabled.
    # self.clean_up_html()
    self.relative_path2_abs_url()
def __init__(self, html, url=""):
    """Build the extractor state from ``html`` and process the document.

    Steps, in the order they run:

    1. ``calc_img_area_len`` is replaced by a per-instance LRU-cached
       wrapper.
    2. ``max_score`` and ``scores`` are reset.
    3. ``html`` is parsed with ``BS`` and kept as ``self.doc``; the title
       string is stored in ``self.title``.
    4. ``get_favicon_url``, ``purge``, ``find_main_content`` and
       ``relative_path2_abs_url`` are called.

    :param html: markup handed to ``BS``.
    :param url: stored as ``self.url``; presumably the page address used
        to resolve relative links -- confirm against the helper methods.
    """
    # Per-instance cache, see
    # http://stackoverflow.com/questions/14946264/python-lru-cache-decorator-per-instance
    cache_wrapper = lru_cache(1024)
    self.calc_img_area_len = cache_wrapper(self.calc_img_area_len)
    # Caching of calc_effective_text_len is switched off:
    # self.calc_effective_text_len = lru_cache(1024)(self.calc_effective_text_len)

    self.max_score = -1
    # Warning carried over from the original author: dict lookups rely on
    # __eq__, and BS can report two different nodes as equal, so nodes
    # must NOT be used directly as dict keys.
    self.scores = defaultdict(int)

    self.doc = BS(html)

    # An absent <title> and a title without a usable string both end up
    # as an empty unicode string.
    title_node = self.doc.title
    raw_title = title_node.string if title_node else u""
    self.title = raw_title or u""

    self.article = Null
    self.url = url

    # get_favicon_url() must be called before purge().
    self.get_favicon_url()
    self.purge()
    self.find_main_content()

    # Clean-ups; clean_up_html() is currently disabled.
    # self.clean_up_html()
    self.relative_path2_abs_url()
def registerFunc(cls, func, maxSize):
    """Store an LRU-cached wrapper of ``func`` in ``cls._funcs``.

    :param func: callable to wrap; its registry key is
        ``cls.funcName(func)``.
    :param maxSize: forwarded to ``local_lru.lru_cache`` as ``maxsize``.
    """
    # Build the cached wrapper first, then resolve the registry and the
    # key, mirroring the evaluation order of a single assignment.
    make_cached = local_lru.lru_cache(maxsize=maxSize)
    cached_func = make_cached(func)
    registry = cls._funcs
    registry[cls.funcName(func)] = cached_func