from copy import deepcopy  # used by getSiblingContent below

# Parser and StopWords are assumed to be imported from the host project's
# parser and stop-words modules; only deepcopy is stdlib.


def getBaselineScoreForSiblings(self, topNode):
    """\
    we could have long articles that have tons of paragraphs, so if we
    tried to calculate the base score against the total text score of
    those paragraphs it would be unfair. So we need to normalize the
    score based on the average score of the paragraphs within the top
    node. For example, if our total score of 10 paragraphs was 1000,
    but each had an average value of 100, then 100 should be our base.
    """
    base = 100000
    numberOfParagraphs = 0
    scoreOfParagraphs = 0
    nodesToCheck = Parser.getElementsByTag(topNode, tag='p')

    for node in nodesToCheck:
        nodeText = Parser.getText(node)
        wordStats = StopWords(
            language=self.language).getStopWordCount(nodeText)
        highLinkDensity = self.isHighLinkDensity(node)
        if wordStats.getStopWordCount() > 2 and not highLinkDensity:
            numberOfParagraphs += 1
            scoreOfParagraphs += wordStats.getStopWordCount()

    if numberOfParagraphs > 0:
        base = scoreOfParagraphs / numberOfParagraphs

    return base
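# A minimal standalone sketch of the same normalization, with a plain
# whitespace word count standing in for StopWords().getStopWordCount();
# the names and numbers are illustrative, not part of the extractor.
def _baseline_sketch(paragraph_texts, word_count=lambda t: len(t.split())):
    scores = [word_count(t) for t in paragraph_texts if word_count(t) > 2]
    if not scores:
        return 100000  # same sentinel the method falls back to
    return sum(scores) / len(scores)

# e.g. ten paragraphs scoring 1000 in total yield a baseline of 100,
# matching the example in the docstring above.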
def getSiblingContent(self, currentSibling,
                      baselineScoreForSiblingParagraphs):
    """\
    adds any siblings that may have a decent score to this node
    """
    if currentSibling.tag == 'p' and \
            len(Parser.getText(currentSibling)) > 0:
        e0 = currentSibling
        if e0.tail:
            e0 = deepcopy(e0)
            e0.tail = ''
        return [e0]
    else:
        potentialParagraphs = Parser.getElementsByTag(currentSibling,
                                                      tag='p')
        if potentialParagraphs is None:
            return None
        else:
            ps = []
            for firstParagraph in potentialParagraphs:
                text = Parser.getText(firstParagraph)
                if len(text) > 0:
                    wordStats = StopWords(
                        language=self.language).getStopWordCount(text)
                    paragraphScore = wordStats.getStopWordCount()
                    siblingBaseLineScore = float(.30)
                    highLinkDensity = self.isHighLinkDensity(
                        firstParagraph)
                    score = float(baselineScoreForSiblingParagraphs *
                                  siblingBaseLineScore)
                    if score < paragraphScore and not highLinkDensity:
                        p = Parser.createElement(tag='p',
                                                 text=text,
                                                 tail=None)
                        ps.append(p)
            return ps
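# The acceptance test above reduces to a single comparison; a sketch with
# made-up numbers: a sibling paragraph survives only if its own stop-word
# score beats 30% of the baseline from getBaselineScoreForSiblings.
def _keeps_sibling_paragraph(paragraph_score, baseline,
                             high_link_density=False):
    return (baseline * 0.30) < paragraph_score and not high_link_density

# _keeps_sibling_paragraph(40, 100) -> True
# _keeps_sibling_paragraph(20, 100) -> False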
def isOkToBoost(self, node):
    """\
    A lot of times the first paragraph might be the caption under an
    image, so before boosting a parent node we want to make sure it is
    connected to other paragraphs: within the first n paragraph-steps,
    a sibling paragraph with some substantial weight must exist.
    """
    para = "p"
    stepsAway = 0
    minimumStopWordCount = 5
    maxStepsAwayFromNode = 3

    nodes = self.walkSiblings(node)
    for currentNode in nodes:
        # only paragraph siblings count as steps
        if currentNode.tag == para:
            if stepsAway >= maxStepsAwayFromNode:
                return False
            paraText = Parser.getText(currentNode)
            wordStats = StopWords(
                language=self.language).getStopWordCount(paraText)
            if wordStats.getStopWordCount() > minimumStopWordCount:
                return True
            stepsAway += 1

    return False
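# A rough standalone model of the boost check, using (tag, stop_word_count)
# tuples in place of real DOM siblings; names and data are hypothetical.
def _ok_to_boost_sketch(siblings, minimum=5, max_steps=3):
    steps_away = 0
    for tag, stop_words in siblings:
        if tag == 'p':
            if steps_away >= max_steps:
                return False
            if stop_words > minimum:
                return True
            steps_away += 1
    return False

# _ok_to_boost_sketch([('p', 2), ('p', 9)]) -> True: a substantial
# paragraph sits within three paragraph-steps of the node.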
def removeParagraphsWithFewWords(self):
    """\
    remove paragraphs that have less than x number of words,
    which would indicate that it's some sort of link
    """
    allNodes = Parser.getElementsByTags(self.getTopNode(), ['*'])
    allNodes.reverse()

    for el in allNodes:
        text = Parser.getText(el)
        stopWords = StopWords(
            language=self.language).getStopWordCount(text)
        if stopWords.getStopWordCount() < 3 \
                and len(Parser.getElementsByTag(el, tag='object')) == 0 \
                and len(Parser.getElementsByTag(el, tag='embed')) == 0:
            Parser.remove(el)
        # TODO
        # check if it is in the right place
        else:
            trimmed = Parser.getText(el)
            if trimmed.startswith("(") and trimmed.endswith(")"):
                Parser.remove(el)
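# The removal rule above in isolation, with stand-in values: an element
# goes away if it reads like a stray link (fewer than 3 stop words and no
# embedded media), or if its text is fully parenthesized.
def _should_remove_sketch(stop_words, has_media, text):
    if stop_words < 3 and not has_media:
        return True
    return text.startswith("(") and text.endswith(")")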
def calculateBestNodeBasedOnClustering(self, article):
    doc = article.doc
    topNode = None
    nodesToCheck = self.getNodesToCheck(doc)

    startingBoost = float(1.0)
    cnt = 0
    i = 0
    parentNodes = set()
    nodesWithText = []

    for node in nodesToCheck:
        nodeText = Parser.getText(node)
        wordStats = StopWords(
            language=self.language).getStopWordCount(nodeText)
        highLinkDensity = self.isHighLinkDensity(node)
        if wordStats.getStopWordCount() > 2 and not highLinkDensity:
            nodesWithText.append(node)

    numberOfNodes = len(nodesWithText)
    negativeScoring = 0
    bottomNodesForNegativeScore = float(numberOfNodes) * 0.25

    for node in nodesWithText:
        boostScore = float(0)
        # boost
        if self.isOkToBoost(node):
            if cnt >= 0:
                boostScore = float((1.0 / startingBoost) * 50)
                startingBoost += 1
        # numberOfNodes
        if numberOfNodes > 15:
            if (numberOfNodes - i) <= bottomNodesForNegativeScore:
                booster = float(bottomNodesForNegativeScore -
                                (numberOfNodes - i))
                boostScore = float(-pow(booster, float(2)))
                negscore = -abs(boostScore) + negativeScoring
                if negscore > 40:
                    boostScore = float(5)

        nodeText = Parser.getText(node)
        wordStats = StopWords(
            language=self.language).getStopWordCount(nodeText)
        upscore = int(wordStats.getStopWordCount() + boostScore)

        # parent node
        parentNode = Parser.getParent(node)
        self.updateScore(parentNode, upscore)
        self.updateNodeCount(node.getparent(), 1)

        if node.getparent() not in parentNodes:
            parentNodes.add(node.getparent())

        # parentparent node
        parentParentNode = Parser.getParent(parentNode)
        if parentParentNode is not None:
            self.updateNodeCount(parentParentNode, 1)
            self.updateScore(parentParentNode, upscore / 2)
            if parentParentNode not in parentNodes:
                parentNodes.add(parentParentNode)
        cnt += 1
        i += 1

    topNodeScore = 0
    for e in parentNodes:
        score = self.getScore(e)
        if score > topNodeScore:
            topNode = e
            topNodeScore = score

        if topNode is None:
            topNode = e

    return topNode
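# How the per-node score propagates upward, in miniature: the parent gets
# the full upscore and the grandparent half of it. A hypothetical dict
# stands in for the attribute-backed updateScore()/getScore().
def _propagate_sketch(scores, parent, grandparent, upscore):
    scores[parent] = scores.get(parent, 0) + upscore
    if grandparent is not None:
        scores[grandparent] = scores.get(grandparent, 0) + upscore / 2

# scores = {}; _propagate_sketch(scores, 'div', 'body', 10)
# -> {'div': 10, 'body': 5.0} (under Python 3 division)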