예제 #1
0
    def __init__(self, html, base_url=''):
        """Parse ``html`` and run the extraction pipeline on the parsed tree.

        Wraps three instance methods in LRU caches stored on this instance,
        resets the scoring state, parses ``html`` with ``BS``, records the
        page title and ``base_url``, and then calls ``purge``,
        ``find_main_content`` and ``relative_path2_abs_url`` in that order.

        :param html: markup passed unchanged to ``BS``.
        :param base_url: stored as ``self.base_url``; defaults to ``''``.
        """
        # Wrap the bound methods (not the class functions) so the cache is
        # attached to this instance, see
        # http://stackoverflow.com/questions/14946264/python-lru-cache-decorator-per-instance
        for cached_name in ('calc_img_area_len',
                            'calc_effective_text_len',
                            'parents_of_article_header'):
            bound_method = getattr(self, cached_name)
            setattr(self, cached_name, lru_cache(1024)(bound_method))

        # Original author's warning: dict lookup relies on __eq__, and two
        # distinct BS nodes can compare equal, so do NOT key this dict by node.
        self.scores = defaultdict(int)
        self.max_score = -1

        soup = BS(html)

        # Empty unicode string when there is no <title> or its .string is falsy.
        title_tag = soup.title
        title_text = title_tag.string if title_tag else u''
        self.title = title_text or u''

        self.article = soup
        self.base_url = base_url

        self.purge(soup)
        self.find_main_content(soup)

        # Post-processing; the clean_up_html() pass is currently disabled.
        self.relative_path2_abs_url()
예제 #2
0
    def __init__(self, html, url=''):
        """Parse ``html`` and run the extraction pipeline on ``self.doc``.

        Wraps ``calc_img_area_len`` in an LRU cache stored on this instance,
        resets the scoring state, parses ``html`` with ``BS`` into
        ``self.doc``, records the page title and ``url``, and then calls
        ``get_favicon_url``, ``purge``, ``find_main_content`` and
        ``relative_path2_abs_url`` in that order.

        :param html: markup passed unchanged to ``BS``.
        :param url: stored as ``self.url``; defaults to ``''``.
        """
        # Wrap the bound method (not the class function) so the cache is
        # attached to this instance, see
        # http://stackoverflow.com/questions/14946264/python-lru-cache-decorator-per-instance
        # Caching of calc_effective_text_len is currently disabled.
        memoize = lru_cache(1024)
        self.calc_img_area_len = memoize(self.calc_img_area_len)

        # Original author's warning: dict lookup relies on __eq__, and two
        # distinct BS nodes can compare equal, so do NOT key this dict by node.
        self.scores = defaultdict(int)
        self.max_score = -1

        self.doc = BS(html)

        # Empty unicode string when there is no <title> or its .string is falsy.
        title_tag = self.doc.title
        title_text = title_tag.string if title_tag else u''
        self.title = title_text or u''

        # NOTE(review): Null is not defined in this snippet; presumably a
        # project-level placeholder object -- confirm.
        self.article = Null
        self.url = url

        # The favicon lookup has to happen before purge() (per the original
        # comment; presumably purge() drops nodes it relies on -- confirm).
        self.get_favicon_url()
        self.purge()
        self.find_main_content()

        # Post-processing; the clean_up_html() pass is currently disabled.
        self.relative_path2_abs_url()
예제 #3
0
    def __init__(self, html, url=""):
        """Parse ``html`` and run the extraction pipeline on ``self.doc``.

        Wraps ``calc_img_area_len`` in an LRU cache stored on this instance,
        resets the scoring state, parses ``html`` with ``BS`` into
        ``self.doc``, records the page title and ``url``, and then calls
        ``get_favicon_url``, ``purge``, ``find_main_content`` and
        ``relative_path2_abs_url`` in that order.

        :param html: markup passed unchanged to ``BS``.
        :param url: stored as ``self.url``; defaults to ``""``.
        """
        # Wrap the bound method (not the class function) so the cache is
        # attached to this instance, see
        # http://stackoverflow.com/questions/14946264/python-lru-cache-decorator-per-instance
        # Caching of calc_effective_text_len is currently disabled.
        memoize = lru_cache(1024)
        self.calc_img_area_len = memoize(self.calc_img_area_len)

        # Original author's warning: dict lookup relies on __eq__, and two
        # distinct BS nodes can compare equal, so do NOT key this dict by node.
        self.scores = defaultdict(int)
        self.max_score = -1

        self.doc = BS(html)

        # Empty unicode string when there is no <title> or its .string is falsy.
        title_tag = self.doc.title
        title_text = title_tag.string if title_tag else u""
        self.title = title_text or u""

        # NOTE(review): Null is not defined in this snippet; presumably a
        # project-level placeholder object -- confirm.
        self.article = Null
        self.url = url

        # The favicon lookup has to happen before purge() (per the original
        # comment; presumably purge() drops nodes it relies on -- confirm).
        self.get_favicon_url()
        self.purge()
        self.find_main_content()

        # Post-processing; the clean_up_html() pass is currently disabled.
        self.relative_path2_abs_url()
예제 #4
0
    def registerFunc(cls, func, maxSize):
        """Wrap ``func`` in an LRU cache and store the wrapper in ``cls._funcs``.

        The wrapper is built with ``local_lru.lru_cache(maxsize=maxSize)`` and
        stored under the key returned by ``cls.funcName(func)``. Nothing is
        returned.

        :param func: callable to wrap.
        :param maxSize: forwarded as ``maxsize`` to ``local_lru.lru_cache``.
        """
        # NOTE(review): the first parameter is ``cls`` but no decorator is
        # visible in this snippet -- presumably a @classmethod; confirm.
        cache_decorator = local_lru.lru_cache(maxsize=maxSize)
        cached_func = cache_decorator(func)

        # Build the wrapper before touching the registry, as the original
        # single-statement assignment did.
        registry = cls._funcs
        registry[cls.funcName(func)] = cached_func