Ejemplo n.º 1
0
 def getSiblingContent(self, currentSibling, baselineScoreForSiblingParagraphs, good_path):
     """Collect the paragraphs of a sibling node that are worth keeping.

     A ``<p>`` sibling with non-empty text is returned directly (as a
     tail-less deep copy when it carries tail text).  For any other
     sibling, every ``<p>`` returned by ``Parser.getElementsByTag`` is
     examined and kept when either:

     * its path equals ``good_path`` and it contains no ``<a>`` elements, or
     * its text is non-empty, the value reported by ``getStopWordCount()``
       exceeds 30% of ``baselineScoreForSiblingParagraphs``, and
       ``self.isHighLinkDensity`` is false for it.

     Kept paragraphs are rebuilt as new ``<p>`` elements that hold only the
     paragraph text.

     Args:
         currentSibling: the sibling element to inspect.
         baselineScoreForSiblingParagraphs: numeric baseline; 30% of it is
             the minimum score a paragraph must beat.
         good_path: value compared against ``Parser.getPath`` of each
             candidate paragraph.

     Returns:
         A list of elements, possibly empty.  Never ``None``, so the result
         can always be iterated.
     """
     if currentSibling.tag == 'p' and len(Parser.getText(currentSibling)) > 0:
         e0 = currentSibling
         if e0.tail:
             # Work on a copy so the original element keeps its tail text.
             e0 = deepcopy(e0)
             e0.tail = ''
         return [e0]

     potentialParagraphs = Parser.getElementsByTag(currentSibling, tag='p')
     if potentialParagraphs is None:
         # An empty list (not None) keeps iteration over the result safe.
         return []

     ps = []
     for firstParagraph in potentialParagraphs:
         path = Parser.getPath(firstParagraph)
         text = Parser.getText(firstParagraph)

         # Same path as the known-good paragraph and no links: keep it
         # without scoring.
         if path == good_path and len(Parser.getElementsByTag(firstParagraph, tag='a')) == 0:
             ps.append(Parser.createElement(tag='p', text=text, tail=None))
             continue

         if not text:
             continue

         wordStats = self.stopwordsCls(language=self.language).getStopWordCount(text)
         paragraphScore = wordStats.getStopWordCount()
         highLinkDensity = self.isHighLinkDensity(firstParagraph)
         # A paragraph must beat 30% of the baseline score to be kept.
         score = float(baselineScoreForSiblingParagraphs * float(.30))
         if score < paragraphScore and not highLinkDensity:
             ps.append(Parser.createElement(tag='p', text=text, tail=None))
     return ps
Ejemplo n.º 2
0
    def addSiblings(self, topNode):
        """Insert qualifying content from sibling nodes at the front of ``topNode``.

        The nodes returned by ``self.walkSiblings(topNode)`` are passed one by
        one to ``self.getSiblingContent``; every element it returns is
        inserted at index 0 of ``topNode``.

        Args:
            topNode: the element that receives the sibling content.

        Returns:
            The same ``topNode`` object, modified in place.
        """
        baselineScoreForSiblingParagraphs = self.getBaselineScoreForSiblings(topNode)
        results = self.walkSiblings(topNode)

        # Path of the first <p> under topNode; stays [] when there is none
        # (or when the lookup yields None).
        good_ps = Parser.getElementsByTag(topNode, tag='p')
        good_path = Parser.getPath(good_ps[0]) if good_ps else []

        for currentNode in results:
            ps = self.getSiblingContent(currentNode, baselineScoreForSiblingParagraphs, good_path)
            # Tolerate a None result instead of raising TypeError on iteration.
            for p in ps or ():
                topNode.insert(0, p)
        return topNode
Ejemplo n.º 3
0
    def addSiblings(self, topNode):
        """Insert qualifying content from sibling nodes at the front of ``topNode``.

        Siblings are walked from ``topNode``'s parent when ``topNode`` is that
        parent's only child and has no tail text; otherwise they are walked
        from ``topNode`` itself.  Each walked node is passed to
        ``self.getSiblingContent`` and every element it returns is inserted at
        index 0 of ``topNode``.

        Args:
            topNode: the element that receives the sibling content.

        Returns:
            The same ``topNode`` object, modified in place.
        """
        baselineScoreForSiblingParagraphs = self.getBaselineScoreForSiblings(topNode)

        parent = topNode.getparent()
        # getparent() gives None for an element without a parent (e.g. the
        # root), so check that before calling len() on it.
        isOnlyChild = parent is not None and len(parent) == 1 and topNode.tail is None
        results = self.walkSiblings(parent if isOnlyChild else topNode)

        # Path of the first <p> descendant of topNode; stays [] when none exists.
        good_ps = topNode.find('.//p')
        good_path = Parser.getPath(good_ps) if good_ps is not None else []

        for currentNode in results:
            ps = self.getSiblingContent(currentNode, baselineScoreForSiblingParagraphs, good_path)
            # Tolerate a None result instead of raising TypeError on iteration.
            for p in ps or ():
                topNode.insert(0, p)

        return topNode
Ejemplo n.º 4
0
 def getSiblingContent(self, currentSibling, baselineScoreForSiblingParagraphs, good_path):
     """Collect the elements of a sibling node that are worth keeping.

     A ``p``/``h2``/``h3``/``h4`` sibling with non-empty text is returned
     directly.  For any other sibling, every ``<p>`` returned by
     ``Parser.getElementsByTag`` is examined and kept when either:

     * its path equals ``good_path`` and ``Parser.hasChildTag`` reports no
       ``a`` child, or
     * ``self.getTextStats`` reports a positive text length, a stop count
       above 30% of ``baselineScoreForSiblingParagraphs``, and a false
       high-link flag.

     Args:
         currentSibling: the sibling element to inspect.
         baselineScoreForSiblingParagraphs: numeric baseline; 30% of it is
             the minimum score a paragraph must beat.
         good_path: value compared against ``Parser.getPath`` of each
             candidate paragraph.

     Returns:
         A list of the kept elements in reverse of the order in which they
         were examined, possibly empty.  Never ``None``, so the result can
         always be iterated.
     """
     if currentSibling.tag in ('p', 'h2', 'h3', 'h4') and len(Parser.getText(currentSibling)) > 0:
         return [currentSibling]

     potentialParagraphs = Parser.getElementsByTag(currentSibling, tag='p')
     if potentialParagraphs is None:
         # An empty list (not None) keeps iteration over the result safe.
         return []

     ps = []
     for firstParagraph in potentialParagraphs:
         path = Parser.getPath(firstParagraph)
         textLen, stopCount, isHighLink = self.getTextStats(firstParagraph)

         # Same path as the known-good paragraph and no links: keep it
         # without scoring.
         if path == good_path and not Parser.hasChildTag(firstParagraph, 'a'):
             ps.append(firstParagraph)
             continue

         if textLen > 0:
             # A paragraph must beat 30% of the baseline score to be kept.
             score = float(baselineScoreForSiblingParagraphs * 0.30)
             if score < stopCount and not isHighLink:
                 ps.append(firstParagraph)

     # Appending and reversing once yields the same order as inserting each
     # kept element at index 0.
     ps.reverse()
     return ps