Example #1
0
 def getBaselineScoreForSiblings(self, topNode):
     """\
     Return the score that sibling paragraphs are measured against.

     Long articles can hold a great many paragraphs, so judging a
     sibling against the summed score of all of them would be unfair.
     Instead the baseline is the average stop-word count of the
     paragraphs inside topNode that have more than two stop words and
     are not flagged by isHighLinkDensity. For example, 10 paragraphs
     totalling 1000 give a baseline of 100.

     When no paragraph qualifies, the fallback value 100000 is returned.
     """
     qualifyingCount = 0
     stopWordTotal = 0

     for paragraph in Parser.getElementsByTag(topNode, tag='p'):
         paragraphText = Parser.getText(paragraph)
         stats = StopWords().getStopWordCount(paragraphText)
         linkHeavy = self.isHighLinkDensity(paragraph)
         # Skip paragraphs that are too thin or flagged as link-heavy.
         if stats.getStopWordCount() <= 2 or linkHeavy:
             continue
         qualifyingCount += 1
         stopWordTotal += stats.getStopWordCount()

     if qualifyingCount > 0:
         return stopWordTotal / qualifyingCount
     return 100000
Example #2
0
    def getBaselineScoreForSiblings(self, topNode):
        """\
        Compute the per-paragraph baseline used to judge sibling nodes.

        Scoring a sibling against the total text score of a long article
        would be unfair, so the score is normalized: the stop-word counts
        of the paragraphs under topNode (those with more than two stop
        words and not flagged by isHighLinkDensity) are averaged.
        E.g. 10 paragraphs scoring 1000 in total yield a base of 100.

        Returns 100000 when no paragraph under topNode qualifies.
        """
        contentScores = []

        for paragraph in Parser.getElementsByTag(topNode, tag='p'):
            paragraphText = Parser.getText(paragraph)
            stats = StopWords(
                language=self.language).getStopWordCount(paragraphText)
            linkHeavy = self.isHighLinkDensity(paragraph)
            if stats.getStopWordCount() > 2 and not linkHeavy:
                contentScores.append(stats.getStopWordCount())

        if not contentScores:
            # nothing usable to average over: fall back to the default
            return 100000
        return sum(contentScores) / len(contentScores)
Example #3
0
 def getSiblingContent(self, currentSibling, baselineScoreForSiblingParagraphs):
     """\
     Collect the paragraphs a sibling node contributes to the article.

     A non-empty <p> sibling is returned as a one-item list; when it
     carries tail text, a deep copy with the tail cleared is returned
     instead. For any other sibling, each nested <p> with text is kept
     when its stop-word count exceeds 30% of
     baselineScoreForSiblingParagraphs and it is not flagged by
     isHighLinkDensity; kept paragraphs are rebuilt as new <p> elements
     holding only their text.

     Returns a list of elements, or None if the nested paragraph lookup
     yields None.
     """
     isParagraph = currentSibling.tag == 'p'
     if isParagraph and len(Parser.getText(currentSibling)) > 0:
         sibling = currentSibling
         if sibling.tail:
             # Work on a copy so the original node keeps its tail text.
             sibling = deepcopy(sibling)
             sibling.tail = ''
         return [sibling]

     candidates = Parser.getElementsByTag(currentSibling, tag='p')
     if candidates is None:
         return None

     accepted = []
     for candidate in candidates:
         content = Parser.getText(candidate)
         if len(content) == 0:
             continue
         stats = StopWords().getStopWordCount(content)
         stopWordScore = stats.getStopWordCount()
         ratio = float(.30)
         linkHeavy = self.isHighLinkDensity(candidate)
         threshold = float(baselineScoreForSiblingParagraphs * ratio)
         if threshold < stopWordScore and not linkHeavy:
             accepted.append(
                 Parser.createElement(tag='p', text=content, tail=None))
     return accepted
Example #4
0
    def isOkToBoost(self, node):
        """\
        Decide whether the parent of a paragraph node deserves a boost.

        A lot of times the first paragraph might be the caption under an
        image, so a node is only boosted when it is connected to other
        paragraphs: among the first paragraph siblings yielded by
        walkSiblings, one must carry some substantial weight, i.e. more
        than minimumStopWordCount stop words.

        Returns True as soon as such a sibling paragraph is found, and
        False once maxStepsAwayFromNode paragraphs have been inspected
        without a match or the siblings run out.
        """
        para = "p"
        stepsAway = 0
        minimumStopWordCount = 5
        maxStepsAwayFromNode = 3

        for currentNode in self.walkSiblings(node):
            # only paragraph siblings are inspected (and counted)
            if currentNode.tag != para:
                continue
            if stepsAway >= maxStepsAwayFromNode:
                return False
            paraText = Parser.getText(currentNode)
            wordStats = StopWords(
                language=self.language).getStopWordCount(paraText)
            # The getter has to be called here: comparing the bound method
            # object itself with an int does not test the count (and raises
            # TypeError on Python 3).
            if wordStats.getStopWordCount() > minimumStopWordCount:
                return True
            stepsAway += 1
        return False
 def removeParagraphsWithFewWords(self):
     """\
     Prune low-content elements from the top node.

     Elements with fewer than three stop words would indicate some sort
     of link rather than article text. Every element matched by '*'
     under self.getTopNode() is visited in reverse order and removed
     when it has fewer than three stop words and contains no <object>
     or <embed> element, or otherwise when its text starts with "(" and
     ends with ")".
     """
     elements = Parser.getElementsByTags(self.getTopNode(), ['*'])
     # presumably reversed so descendants are handled before their
     # ancestors - TODO confirm
     elements.reverse()
     for element in elements:
         elementText = Parser.getText(element)
         stats = StopWords().getStopWordCount(elementText)
         discard = (
             stats.getStopWordCount() < 3
             and len(Parser.getElementsByTag(element, tag='object')) == 0
             and len(Parser.getElementsByTag(element, tag='embed')) == 0
         )
         if not discard:
             # TODO
             # check if it is in the right place
             body = Parser.getText(element)
             discard = body.startswith("(") and body.endswith(")")
         if discard:
             Parser.remove(element)
Example #6
0
 def removeParagraphsWithFewWords(self):
     """\
     Strip elements that carry too little text from the top node.

     An element with fewer than three stop words is taken to be some
     sort of link and is removed, unless it contains an <object> or
     <embed> element. Elements that survive that check are still
     removed when their text starts with "(" and ends with ")".
     All elements matched by '*' under self.getTopNode() are processed,
     in reverse order.
     """
     def lacksTag(element, tagName):
         # True when no element with the given tag is found under element.
         return len(Parser.getElementsByTag(element, tag=tagName)) == 0

     nodes = Parser.getElementsByTags(self.getTopNode(), ['*'])
     nodes.reverse()
     for node in nodes:
         nodeText = Parser.getText(node)
         wordStats = StopWords().getStopWordCount(nodeText)
         if (wordStats.getStopWordCount() < 3
                 and lacksTag(node, 'object')
                 and lacksTag(node, 'embed')):
             Parser.remove(node)
             continue
         # TODO
         # check if it is in the right place
         trimmed = Parser.getText(node)
         if trimmed.startswith("(") and trimmed.endswith(")"):
             Parser.remove(node)
Example #7
0
 def getSiblingContent(self, currentSibling,
                       baselineScoreForSiblingParagraphs):
     """\
     Gather sibling paragraphs with a decent score for the top node.

     If currentSibling is itself a <p> with text, it is returned alone
     in a list (as a deep copy with an empty tail when it has tail
     text). Otherwise its nested <p> elements are scored by stop-word
     count for self.language; those scoring above 30% of
     baselineScoreForSiblingParagraphs and not flagged by
     isHighLinkDensity are returned as newly created <p> elements
     containing just their text.

     Returns a list, or None when the nested lookup returns None.
     """
     if currentSibling.tag == 'p':
         if len(Parser.getText(currentSibling)) > 0:
             paragraph = currentSibling
             if paragraph.tail:
                 # copy first so the node in the document is not modified
                 paragraph = deepcopy(paragraph)
                 paragraph.tail = ''
             return [paragraph]

     nestedParagraphs = Parser.getElementsByTag(currentSibling, tag='p')
     if nestedParagraphs is None:
         return None

     kept = []
     for nested in nestedParagraphs:
         nestedText = Parser.getText(nested)
         if len(nestedText) > 0:
             stats = StopWords(
                 language=self.language).getStopWordCount(nestedText)
             nestedScore = stats.getStopWordCount()
             fraction = float(.30)
             linkHeavy = self.isHighLinkDensity(nested)
             cutoff = float(baselineScoreForSiblingParagraphs * fraction)
             if cutoff < nestedScore and not linkHeavy:
                 kept.append(
                     Parser.createElement(tag='p', text=nestedText,
                                          tail=None))
     return kept
Example #8
0
    def calculateBestNodeBasedOnClustering(self, article):
        """\
        Pick the element most likely to hold the article body.

        Nodes from getNodesToCheck(article.doc) that have more than two
        stop words and are not flagged by isHighLinkDensity are scored by
        their stop-word count plus a positional boost. Each score is
        added to the node's parent and (as upscore / 2) to the
        grandparent. The scored ancestor with the highest getScore value
        is returned, or None when no ancestor was scored.
        """
        def stopWordStats(element):
            # Stop-word statistics for the text of a single element.
            elementText = Parser.getText(element)
            return StopWords(
                language=self.language).getStopWordCount(elementText)

        textNodes = []
        for candidate in self.getNodesToCheck(article.doc):
            stats = stopWordStats(candidate)
            linkHeavy = self.isHighLinkDensity(candidate)
            if stats.getStopWordCount() > 2 and not linkHeavy:
                textNodes.append(candidate)

        totalNodes = len(textNodes)
        tailSize = float(totalNodes) * 0.25
        nextBoostRank = float(1.0)
        negativeScoring = 0
        scoredAncestors = set()

        for index, node in enumerate(textNodes):
            boost = float(0)
            # Each boosted node gets a smaller bonus than the previous one
            # (50, 25, 16.6, ...).
            # NOTE(review): "index >= 0" always holds as written - confirm
            # whether a different threshold was intended.
            if self.isOkToBoost(node) and index >= 0:
                boost = float((1.0 / nextBoostRank) * 50)
                nextBoostRank += 1

            # With more than 15 nodes, the last quarter is penalized
            # quadratically, replacing any boost given above.
            remaining = totalNodes - index
            if totalNodes > 15 and remaining <= tailSize:
                penaltyBase = float(tailSize - remaining)
                boost = float(-pow(penaltyBase, float(2)))
                # NOTE(review): this value is never positive while
                # negativeScoring stays 0, so the branch below cannot
                # run - confirm the intended sign.
                negscore = -abs(boost) + negativeScoring
                if negscore > 40:
                    boost = float(5)

            upscore = int(stopWordStats(node).getStopWordCount() + boost)

            # parent node
            parent = Parser.getParent(node)
            self.updateScore(parent, upscore)
            directParent = node.getparent()
            self.updateNodeCount(directParent, 1)
            scoredAncestors.add(directParent)

            # parentparent node
            grandparent = Parser.getParent(parent)
            if grandparent is not None:
                self.updateNodeCount(grandparent, 1)
                self.updateScore(grandparent, upscore / 2)
                scoredAncestors.add(grandparent)

        topNode = None
        bestScore = 0
        for ancestor in scoredAncestors:
            ancestorScore = self.getScore(ancestor)

            if ancestorScore > bestScore:
                topNode = ancestor
                bestScore = ancestorScore

            # guarantees a result even if no score is above zero
            if topNode is None:
                topNode = ancestor

        return topNode
Example #9
0
 def calculateBestNodeBasedOnClustering(self, article):
     """\
     Find the node that most likely wraps the article text.

     Works in three passes: (1) filter the nodes returned by
     getNodesToCheck(article.doc) down to those with more than two stop
     words that isHighLinkDensity does not flag; (2) score each of them
     (stop-word count plus a positional boost or penalty) and credit the
     score to its parent and, as upscore / 2, to its grandparent;
     (3) return the credited ancestor with the highest getScore value.
     Returns None when nothing was credited.
     """
     candidates = self.getNodesToCheck(article.doc)

     # Pass 1: keep only nodes that look like real text.
     nodesWithText = []
     for candidate in candidates:
         candidateText = Parser.getText(candidate)
         candidateStats = StopWords().getStopWordCount(candidateText)
         linkHeavy = self.isHighLinkDensity(candidate)
         if candidateStats.getStopWordCount() <= 2 or linkHeavy:
             continue
         nodesWithText.append(candidate)

     nodeTotal = len(nodesWithText)
     negativeScoring = 0
     bottomCutoff = float(nodeTotal) * 0.25
     boostDivisor = float(1.0)
     processed = 0
     creditedNodes = set()

     # Pass 2: score every text node and credit its ancestors.
     for node in nodesWithText:
         boostScore = float(0)
         # boost
         # NOTE(review): "processed >= 0" is always true as written -
         # confirm whether another threshold was meant.
         if self.isOkToBoost(node) and processed >= 0:
             boostScore = float((1.0 / boostDivisor) * 50)
             boostDivisor += 1
         # numberOfNodes
         distanceFromEnd = nodeTotal - processed
         if nodeTotal > 15 and distanceFromEnd <= bottomCutoff:
             booster = float(bottomCutoff - distanceFromEnd)
             boostScore = float(-pow(booster, float(2)))
             # NOTE(review): never positive while negativeScoring is 0,
             # so the check below cannot succeed - confirm intended sign.
             negscore = -abs(boostScore) + negativeScoring
             if negscore > 40:
                 boostScore = float(5)

         nodeText = Parser.getText(node)
         nodeStats = StopWords().getStopWordCount(nodeText)
         upscore = int(nodeStats.getStopWordCount() + boostScore)

         # parent node
         parentNode = Parser.getParent(node)
         self.updateScore(parentNode, upscore)
         immediateParent = node.getparent()
         self.updateNodeCount(immediateParent, 1)
         creditedNodes.add(immediateParent)

         # parentparent node
         grandParentNode = Parser.getParent(parentNode)
         if grandParentNode is not None:
             self.updateNodeCount(grandParentNode, 1)
             self.updateScore(grandParentNode, upscore / 2)
             creditedNodes.add(grandParentNode)

         processed += 1

     # Pass 3: the highest-scoring credited node wins.
     topNode = None
     highestScore = 0
     for credited in creditedNodes:
         creditedScore = self.getScore(credited)

         if creditedScore > highestScore:
             topNode = credited
             highestScore = creditedScore

         # fall back to the first node seen when no score exceeds zero
         if topNode is None:
             topNode = credited

     return topNode