コード例 #1
0
	def get_article(self, candidates, best_candidate):
		"""Collect the best candidate and its related siblings into a new container.

		Walks the children of the best candidate's parent and appends to a
		fresh ``<div/>`` every non-string sibling that

		* is the best candidate's element itself, or
		* is a scored candidate whose ``'content_score'`` reaches the sibling
		  threshold (the larger of 10 and 20% of the best score), or
		* is a ``<p>`` that looks like body text (long with low link density,
		  or short, link-free and containing a sentence-ending period).

		This picks up content such as preambles or text that was split from
		the main candidate by removed ads.

		Args:
			candidates: Mapping keyed by ``HashableElement`` whose values are
				candidate records exposing ``'content_score'``.
			best_candidate: Candidate record with ``'elem'`` and
				``'content_score'`` entries.

		Returns:
			The object produced by ``parse("<div/>")`` with the selected
			siblings appended to it.
		"""
		sibling_score_threshold = max(10, best_candidate['content_score'] * 0.2)
		output = parse("<div/>")
		best_elem = best_candidate['elem']

		# Iterate over a snapshot of the sibling list. With BeautifulSoup-style
		# trees, appending a node to `output` detaches it from its current
		# parent, which would shrink `contents` mid-iteration and cause the
		# loop to skip siblings.
		for sibling in list(best_elem.parent.contents):
			if isinstance(sibling, NavigableString):
				continue

			append = sibling is best_elem

			sibling_key = HashableElement(sibling)
			if sibling_key in candidates and candidates[sibling_key]['content_score'] >= sibling_score_threshold:
				append = True

			if sibling.name == "p":
				link_density = self.get_link_density(sibling)
				# `.string` may be None, hence the fallback to an empty string.
				node_content = sibling.string or ""
				node_length = len(node_content)

				# NOTE: a length of exactly 80 matches neither branch; this
				# mirrors the original thresholds and is kept unchanged.
				if node_length > 80 and link_density < 0.25:
					append = True
				elif node_length < 80 and link_density == 0 and re.search(r'\.( |$)', node_content):
					append = True

			if append:
				output.append(sibling)

		# Fallback: append the candidate's element, not the candidate record
		# (a dict), which is not a tree node.
		if not output:
			output.append(best_elem)
		return output
コード例 #2
0
	def _html(self, force=False):
		"""Return the parsed document, parsing it on demand and caching the result.

		Args:
			force: When true, re-parse even if a parsed document is already
				stored on ``self.html``.

		Returns:
			The value of ``self.html``. It is (re)built from ``self.input`` via
			``parse`` using ``self.options['url']`` and ``self.notify`` when
			``force`` is set or nothing has been parsed yet.
		"""
		already_parsed = self.html is not None
		if already_parsed and not force:
			# Reuse the cached parse result.
			return self.html

		self.html = parse(self.input, self.options['url'], notify=self.notify)
		return self.html