def weight_links(self, window_width=1):
    """Pair every link of the page body with the weight of the words around it.

    The body is split by ``ContentExtractor.extract_mapped_content_list``
    into a mapped content list and the list of links.  Every entry of the
    content list whose type is not ``int`` is treated as a word and weighted
    with ``self._weight_words``.  Words and weights are cached on
    ``self._words`` / ``self._weighted_words`` the first time they are needed.

    With ``n = len(links)``, the weight of the link at position ``i`` is::

        sum(weights[max(0, i - window_width):min(n, i + window_width)])

    i.e. a window of up to ``2 * window_width`` word weights, clipped at both
    ends.  When ``n <= window_width`` every link receives the same total,
    ``sum(weights[:n])``.

    window_width: half-width of the window, counted in words.  Negative
        values are treated as 0 (empty window, weight 0 for every link).

    result: [(link, weight)], exactly one entry per link, in link order:
        - link is BeautifulSoup tag (<a>)
        - weight is the sum of the selected word weights (an integer,
          assuming ``self._weight_words`` returns integers)
    """
    content_list, links = ContentExtractor.extract_mapped_content_list(
        self._body_soup
    )
    if not self._words:
        # Entries whose exact type is int are not words (presumably position
        # markers for the links -- confirm against ContentExtractor).
        self._words = [item for item in content_list if type(item) is not int]
        # A real list, so the cache can be sliced and re-read on any Python.
        self._weighted_words = [
            self._weight_words(word) for word in self._words
        ]

    link_count = len(links)
    half_width = max(window_width, 0)

    # prefix[k] == sum(weights[:k]).  Each window sum becomes a difference of
    # two entries, which removes the error-prone running left/right indices.
    prefix = [0]
    for weight in self._weighted_words[:link_count]:
        prefix.append(prefix[-1] + weight)
    # Smaller than link_count when there are fewer words than links; clamping
    # to it keeps every lookup in range instead of raising IndexError.
    last = len(prefix) - 1

    weighted_links = []
    for position, link in enumerate(links):
        low = min(max(position - half_width, 0), last)
        high = min(position + half_width, last)
        weighted_links.append((link, prefix[high] - prefix[low]))
    return weighted_links
def get_page_weight(self):
    """Return the sum of the weights of all words in the page body.

    Words are the entries of the mapped content list (first element of the
    result of ``ContentExtractor.extract_mapped_content_list``) whose type is
    not ``int``; each one is weighted with ``self._weight_words``.

    Words and weights are cached on ``self._words`` and
    ``self._weighted_words``.  The body is only extracted when that cache is
    still empty, so repeated calls do not re-process ``self._body_soup``.
    """
    if not self._words:
        content_list = ContentExtractor.extract_mapped_content_list(
            self._body_soup
        )[0]
        # Entries whose exact type is int are not words (presumably position
        # markers for the links -- confirm against ContentExtractor).
        self._words = [item for item in content_list if type(item) is not int]
        # Stored as a list: a lazy map object would be exhausted by the first
        # sum() on Python 3 and make later calls return 0.
        self._weighted_words = [
            self._weight_words(word) for word in self._words
        ]
    return sum(self._weighted_words)