Esempio n. 1
0
 def getSiblingContent(self, currentSibling, baselineScoreForSiblingParagraphs, good_path):
     """\
     Collect the paragraphs of a sibling node that are worth keeping.

     A sibling that is itself a 'p' with non-empty text is returned as a
     one-item list; when it carries tail text, a deep copy with the tail
     blanked is returned instead, so the sibling itself is not modified.

     Otherwise the 'p' elements that Parser.getElementsByTag yields for
     the sibling are examined one by one.  A paragraph whose path equals
     good_path and that contains no 'a' elements is accepted outright.
     Any other paragraph with text is accepted only if its stop-word
     count is greater than 30% of baselineScoreForSiblingParagraphs and
     isHighLinkDensity does not flag it.  Accepted paragraphs are rebuilt
     as fresh 'p' elements holding only the extracted text.

     Returns a list of elements, or None when the paragraph lookup
     itself yields None.
     """
     if currentSibling.tag == 'p' and len(Parser.getText(currentSibling)) > 0:
         if not currentSibling.tail:
             return [currentSibling]
         # Work on a copy so the tail text is dropped without touching
         # the element that is still attached to the document.
         detached = deepcopy(currentSibling)
         detached.tail = ''
         return [detached]

     candidates = Parser.getElementsByTag(currentSibling, tag='p')
     if candidates is None:
         return None

     kept = []
     for candidate in candidates:
         candidatePath = Parser.getPath(candidate)
         candidateText = Parser.getText(candidate)

         # Link-free paragraphs on the known good path bypass the scoring.
         if candidatePath == good_path and len(Parser.getElementsByTag(candidate, tag='a')) == 0:
             kept.append(Parser.createElement(tag='p', text=candidateText, tail=None))
             continue

         if len(candidateText) == 0:
             continue

         stats = self.stopwordsCls(language=self.language).getStopWordCount(candidateText)
         stopWordTotal = stats.getStopWordCount()
         linkHeavy = self.isHighLinkDensity(candidate)
         threshold = float(baselineScoreForSiblingParagraphs * float(.30))
         if threshold < stopWordTotal and not linkHeavy:
             kept.append(Parser.createElement(tag='p', text=candidateText, tail=None))
     return kept
Esempio n. 2
0
 def get_siblings_content(self, current_sibling, baselinescore_siblings_para):
     """\
     Pick out the paragraphs of a sibling node that score well enough.

     If the sibling is a 'p' with non-empty text it is returned alone in
     a list (as a deep copy with an empty tail when it has tail text).

     Otherwise every 'p' that Parser.getElementsByTag yields for the
     sibling is scored: a paragraph with text is kept when its stop-word
     count is greater than 30% of baselinescore_siblings_para and
     is_highlink_density does not flag it.  Kept paragraphs are returned
     as new 'p' elements that carry only the extracted text.

     Returns a list of elements, or None when the paragraph lookup
     yields None.
     """
     if current_sibling.tag == 'p' and len(Parser.getText(current_sibling)) > 0:
         if not current_sibling.tail:
             return [current_sibling]
         # Copy before clearing the tail so the original node is untouched.
         tailless = deepcopy(current_sibling)
         tailless.tail = ''
         return [tailless]

     candidates = Parser.getElementsByTag(current_sibling, tag='p')
     if candidates is None:
         return None

     selected = []
     for candidate in candidates:
         candidate_text = Parser.getText(candidate)
         if len(candidate_text) == 0:
             continue
         stats = self.stopwords_class(language=self.language).get_stopword_count(candidate_text)
         stopword_total = stats.get_stopword_count()
         link_heavy = self.is_highlink_density(candidate)
         threshold = float(baselinescore_siblings_para * float(.30))
         if threshold < stopword_total and not link_heavy:
             selected.append(Parser.createElement(tag='p', text=candidate_text, tail=None))
     return selected
Esempio n. 3
0
 def get_siblings_content(self, current_sibling, baselinescore_siblings_para):
     """\
     Return the paragraph elements of a sibling that deserve inclusion.

     * A sibling that is a 'p' with non-empty text is returned as a
       single-item list.  When it has tail text, the item is a deep copy
       whose tail is set to '' rather than the sibling itself.
     * For any other sibling, each 'p' obtained from
       Parser.getElementsByTag is checked: it must have text, a
       stop-word count greater than 0.30 * baselinescore_siblings_para,
       and must not be reported by is_highlink_density.  Each match is
       re-created as a bare 'p' element containing the text.

     The result is a list, or None if Parser.getElementsByTag gives None.
     """
     sibling_is_paragraph = current_sibling.tag == 'p'
     if sibling_is_paragraph and len(Parser.getText(current_sibling)) > 0:
         result = current_sibling
         if result.tail:
             result = deepcopy(result)
             result.tail = ''
         return [result]

     found = Parser.getElementsByTag(current_sibling, tag='p')
     if found is None:
         return None

     accepted = []
     for paragraph in found:
         content = Parser.getText(paragraph)
         if len(content) > 0:
             counter = self.stopwords_class(language=self.language)
             stopwords_found = counter.get_stopword_count(content).get_stopword_count()
             ratio = float(.30)
             too_many_links = self.is_highlink_density(paragraph)
             minimum = float(baselinescore_siblings_para * ratio)
             if not too_many_links and minimum < stopwords_found:
                 rebuilt = Parser.createElement(tag='p', text=content, tail=None)
                 accepted.append(rebuilt)
     return accepted
Esempio n. 4
0
    def postExtractionCleanup(self, targetNode):
        """\
        remove any divs that looks like non-content,
        clusters of links, or paras with no gusto

        Steps:
        1. Text sitting directly on targetNode is moved into a new element
           built by Parser.createElement and inserted as the first child.
        2. addSiblings is applied; its result is the node that is cleaned
           and returned.
        3. Children tagged h2/h3/h4 or p/pre/font are left alone.  Any
           other child is removed when getTextStats reports high link
           density, when isTableTagAndNoParagraphsExist is true for it, or
           when isNodeScoreThreshholdMet is false for it.
        4. Trailing h2/h3/h4 children are removed, walking backwards until
           the first child with a different tag.
        """
        headingTags = ('h2', 'h3', 'h4')
        keptTags = ('p', 'pre', 'font')

        if targetNode.text is not None:
            wrapper = Parser.createElement(text=targetNode.text)
            targetNode.text = None
            targetNode.insert(0, wrapper)

        node = self.addSiblings(targetNode)

        # Iterate over a snapshot: children are removed inside the loop and
        # mutating a container while iterating it is only safe if the
        # element implementation happens to tolerate it.
        for child in list(node):
            if child.tag in headingTags or child.tag in keptTags:
                continue
            # Only the link-density flag of the stats triple is needed.
            _, _, isHighLink = self.getTextStats(child)
            if isHighLink \
                    or self.isTableTagAndNoParagraphsExist(child) \
                    or not self.isNodeScoreThreshholdMet(node, child):
                Parser.remove(child)

        # Strip headings left dangling at the end; snapshot for the same
        # reason as above.
        for child in reversed(list(node)):
            if child.tag not in headingTags:
                break
            Parser.remove(child)

        return node
Esempio n. 5
0
    def postExtractionCleanup(self, targetNode):
        """\
        remove any divs that looks like non-content,
        clusters of links, or paras with no gusto

        Steps:
        1. Text sitting directly on targetNode is moved into a new element
           built by Parser.createElement and inserted as the first child.
        2. addSiblings is applied; its result is the node that is cleaned
           and returned.
        3. Children tagged h2/h3/h4 or p/pre/font are left alone.  Any
           other child is removed when isHighLinkDensity is true for it,
           when isTableTagAndNoParagraphsExist is true for it, or when
           isNodeScoreThreshholdMet is false for it.
        4. Walking backwards from the last child, childless 'p' elements
           whose text is missing or consists only of space/tab/CR/LF, and
           h2/h3/h4 elements, are removed until the first child that is
           neither.
        """
        headingTags = ('h2', 'h3', 'h4')
        keptTags = ('p', 'pre', 'font')

        if targetNode.text is not None:
            wrapper = Parser.createElement(text=targetNode.text)
            targetNode.text = None
            targetNode.insert(0, wrapper)

        node = self.addSiblings(targetNode)

        # Iterate over a snapshot: children are removed inside the loop and
        # mutating a container while iterating it is only safe if the
        # element implementation happens to tolerate it.
        for child in list(node):
            if child.tag in headingTags or child.tag in keptTags:
                continue
            if self.isHighLinkDensity(child) \
                    or self.isTableTagAndNoParagraphsExist(child) \
                    or not self.isNodeScoreThreshholdMet(node, child):
                Parser.remove(child)

        for child in reversed(list(node)):
            if child.tag == 'p' and len(child) == 0:
                # Blank means: no text at all, or nothing left once
                # space/tab/CR/LF characters are stripped.
                if child.text is None or not child.text.strip(' \t\r\n'):
                    Parser.remove(child)
                    continue
            if child.tag not in headingTags:
                break
            Parser.remove(child)
        return node
Esempio n. 6
0
    def getReplacementNodes(self, div):
        """\
        Build the list of nodes that should stand in for div.

        Loose text and children whose tag is in self.goodInlineTags are
        gathered into 'p' elements created with Parser.createElement; every
        other child is emitted as-is, between those paragraphs, in
        document order.

        Side effects on the input: div.text and each child's tail are
        unescaped through self.parser.unescape and stripped of leading and
        trailing tab/CR/LF characters; inline children are appended to the
        new paragraphs; the tail of each non-inline child is set to None
        after its text has been queued for the following paragraph.

        Returns the list of paragraph elements and non-inline children.
        """
        pending = []      # text fragments waiting to be placed in a paragraph
        result = []
        para = Parser.createElement(tag='p', text='', tail=None)
        lastInline = None

        if div.text is not None:
            div.text = self.parser.unescape(div.text).strip('\t\r\n')
            if len(div.text):
                pending.append(div.text)

        # Snapshot the children: the loop appends them to other elements.
        for child in list(div):
            if child.tail is not None:
                child.tail = self.parser.unescape(child.tail).strip('\t\r\n')

            # Place any queued text before dealing with this child.
            if pending:
                joined = ''.join(pending)
                pending = []
                if len(para):
                    lastInline.tail = joined
                else:
                    para.text = joined

            if child.tag not in self.goodInlineTags:
                # A non-inline child closes the paragraph under
                # construction, provided that paragraph holds anything.
                if len(para) or len(para.text):
                    result.append(para)
                    para = Parser.createElement(tag='p', text='', tail=None)
                if child.tail is not None and len(child.tail):
                    pending.append(child.tail)
                child.tail = None
                result.append(child)
            else:
                para.append(child)
                lastInline = child

        # flush out anything still remaining
        if pending:
            joined = ''.join(pending)
            if len(para):
                lastInline.tail = joined
            else:
                para.text = joined
        if len(para) or len(para.text):
            result.append(para)

        return result
Esempio n. 7
0
 def getSiblingContent(self, currentSibling,
                       baselineScoreForSiblingParagraphs):
     """\
     Gather the paragraphs of a sibling node that score well enough.

     If the sibling is a 'p' with non-empty text it is returned alone in
     a list (as a deep copy with an empty tail when it has tail text).

     Otherwise each 'p' that Parser.getElementsByTag yields for the
     sibling is scored with StopWords for self.language: a paragraph
     with text is kept when its stop-word count is greater than 30% of
     baselineScoreForSiblingParagraphs and isHighLinkDensity does not
     flag it.  Kept paragraphs are returned as new 'p' elements that
     carry only the extracted text.

     Returns a list of elements, or None when the paragraph lookup
     yields None.
     """
     if currentSibling.tag == 'p' and len(Parser.getText(currentSibling)) > 0:
         if not currentSibling.tail:
             return [currentSibling]
         # Copy before clearing the tail so the original node is untouched.
         tailless = deepcopy(currentSibling)
         tailless.tail = ''
         return [tailless]

     candidates = Parser.getElementsByTag(currentSibling, tag='p')
     if candidates is None:
         return None

     selected = []
     for candidate in candidates:
         candidateText = Parser.getText(candidate)
         if len(candidateText) == 0:
             continue
         stats = StopWords(language=self.language).getStopWordCount(candidateText)
         stopWordTotal = stats.getStopWordCount()
         linkHeavy = self.isHighLinkDensity(candidate)
         threshold = float(baselineScoreForSiblingParagraphs * float(.30))
         if threshold < stopWordTotal and not linkHeavy:
             selected.append(
                 Parser.createElement(tag='p', text=candidateText, tail=None))
     return selected