Example #1
 def walkSiblings(self, node):
     currentSibling = Parser.previousSibling(node)
     b = []
     while currentSibling is not None:
         b.append(currentSibling)
         currentSibling = Parser.previousSibling(currentSibling)
     return b
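Parser.previousSibling is not shown in these snippets; assuming Parser wraps lxml (as the rest of the listing suggests), the same walk can be sketched with lxml's getprevious(), which returns the preceding sibling element or None:

    # A minimal standalone sketch of the technique above, assuming lxml.
    from lxml import html

    doc = html.fromstring('<div><p>a</p><p>b</p><p>c</p></div>')
    node = doc.findall('p')[2]           # start from the <p>c</p> element
    siblings = []
    prev = node.getprevious()            # preceding sibling or None
    while prev is not None:
        siblings.append(prev)
        prev = prev.getprevious()
    print([e.text for e in siblings])    # ['b', 'a'] -- nearest sibling first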
Example #2
    def getMetaLang(self, article):
        """\
        Extract content languages from metas
        """
        # we have a lang attribute in html
        meta_langs = []
        attr = Parser.getAttribute(article.doc, attr='lang')
        if attr is not None: meta_langs += attr.replace(' ','').lower().split(',')
        # look for a Content-Language in the meta tags
        # (attr, value) pairs; a dict would silently drop the duplicate 'name' key
        attrs = [
            ('http-equiv', 'content-language'),
            ('name', 'lang'),
            ('name', 'og:lang'),
        ]
        head = article.doc.find('head')
        if head is not None:
            metas = Parser.getElementsByTag(head, tag='meta')
            for meta in metas:
                for attr, value in attrs:
                    if meta.attrib.get(attr, '').lower().startswith(value):
                        langs = meta.attrib.get('content',None)
                        if langs is not None: meta_langs += langs.replace(' ','').lower().split(',')
                if 'lang' in meta.attrib: meta_langs += meta.attrib['lang'].replace(' ','').lower().split(',')

        result = []
        for lang in meta_langs:
            lang = lang[:2]
            if re.search(RE_LANG, lang):
                result.append(lang)

        return result
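RE_LANG is not defined in the snippet. Since every candidate is truncated to two characters before the check, a plausible definition is a bare two-letter pattern; a minimal sketch under that assumption:

    # RE_LANG is an assumption here: two ASCII letters and nothing else.
    import re

    RE_LANG = r'^[A-Za-z]{2}$'
    print(bool(re.search(RE_LANG, 'en')))   # True
    print(bool(re.search(RE_LANG, 'e1')))   # False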
Example #3
 def removeNodesViaRegEx(self, doc, pattern):
     for selector in ["id", "class"]:
         reg = "//*[re:test(@%s, '%s', 'i')]" % (selector, pattern)
         naughtyList = doc.xpath(reg, namespaces={"re": self.regexpNS})
         for node in naughtyList:
             Parser.remove(node)
     return doc
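The re:test() XPath function comes from the EXSLT extensions, which lxml supports natively; self.regexpNS is presumably the EXSLT regular-expressions namespace URI. A standalone sketch of the same technique:

    # Remove every element whose id matches a pattern, case-insensitively.
    from lxml import html

    REGEXP_NS = 'http://exslt.org/regular-expressions'
    doc = html.fromstring('<div><div id="sidebar-ad">x</div><p>keep</p></div>')
    for node in doc.xpath("//*[re:test(@id, 'sidebar|ad', 'i')]",
                          namespaces={'re': REGEXP_NS}):
        node.getparent().remove(node)
    print(html.tostring(doc, encoding='unicode'))   # <div><p>keep</p></div>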
Example #4
 def test_childNodesWithText(self):
     html = '<html><body>'
     html += '<p>this is a test <a class="link">link</a> and this is <strong class="link">strong</strong></p>'
     html += '<p>this is a test and this is <strong class="link">strong</strong></p>'
     html += '</body></html>'
     doc = Parser.fromstring(html)
     p = Parser.getElementsByTag(doc, tag='p')[0]
Example #5
    def getMetaLang(self, article):
        """\
        Extract content language from meta
        """
        # we have a lang attribute in html
        attr = Parser.getAttribute(article.doc, attr='lang')
        if attr is None:
            # look for a Content-Language in the meta tags
            items = [
                {'tag': 'meta', 'attr': 'http-equiv', 'value': 'content-language'},
                {'tag': 'meta', 'attr': 'name', 'value': 'lang'}
            ]
            for item in items:
                meta = Parser.getElementsByTag(article.doc, **item)
                if meta:
                    attr = Parser.getAttribute(meta[0], attr='content')
                    break

        if attr:
            value = attr[:2]
            if re.search(RE_LANG, value):
                self.language = value.lower()
                return value.lower()

        return None
Example #6
 def getH1(self, article):
     """\
     Fetch the article H1 tag
     """
     """\
     Searching last h1 tag before main article text begins
     """
     h1 = ''
     if article.topNode is not None:
         lastTag = ''
         for i in article.doc.cssselect('[rel=topnode], h1'):
             if i.tag != 'h1':
                 break
             lastTag = i
             h1 = Parser.getText(lastTag)

         # H1 into main article
         if lastTag == '':
             for i in article.doc.cssselect('[rel=topnode], h1'):
                 if i.tag == 'h1':
                     lastTag = i
                     break
     else:
         # Get first H1 tag
         h1Elem = Parser.getElementsByTag(article.doc, tag='h1')
         """ no h1 found """
         if h1Elem is None or len(h1Elem) == 0:
             return h1
         h1 = Parser.getText(h1Elem[0])
     return h1
Example #7
    def postExtractionCleanup(self, targetNode):
        """\
        remove any divs that look like non-content,
        clusters of links, or paras with no gusto
        """
        if targetNode.text is not None:
            e = Parser.createElement(text=targetNode.text)
            targetNode.text = None
            targetNode.insert(0, e)

        node = self.addSiblings(targetNode)

        for e in node:
            if e.tag in ('h2','h3','h4'): continue
            if e.tag not in ('p','pre','font'):
                textLen,stopCount,isHighLink = self.getTextStats(e)
                if isHighLink \
                    or self.isTableTagAndNoParagraphsExist(e) \
                    or not self.isNodeScoreThreshholdMet(node, e):
                    Parser.remove(e)

        for e in reversed(node):
            if e.tag not in ('h2','h3','h4'): break
            Parser.remove(e)

        return node
Example #8
 def getSiblingContent(self, currentSibling, baselineScoreForSiblingParagraphs):
     """\
     adds any siblings that may have a decent score to this node
     """
     if currentSibling.tag == 'p' and len(Parser.getText(currentSibling)) > 0:
         e0 = currentSibling
         if e0.tail:
             e0 = deepcopy(e0)
             e0.tail = ''
         return [e0]
     else:
         potentialParagraphs = Parser.getElementsByTag(currentSibling, tag='p')
         if potentialParagraphs is None:
             return None
         else:
             ps = []
             for firstParagraph in potentialParagraphs:
                 text = Parser.getText(firstParagraph)
                 if len(text) > 0:
                     wordStats = self.stopwordsCls(language=self.language).getStopWordCount(text)
                     paragraphScore = wordStats.getStopWordCount()
                     siblingBaseLineScore = float(.30)
                     highLinkDensity = self.isHighLinkDensity(firstParagraph)
                     score = float(baselineScoreForSiblingParagraphs * siblingBaseLineScore)
                     if score < paragraphScore and not highLinkDensity:
                         p = Parser.createElement(tag='p', text=text, tail=None)
                         ps.append(p)
             return ps
Example #9
 def get_siblings_content(self, current_sibling, baselinescore_siblings_para):
     """\
     adds any siblings that may have a decent score to this node
     """
     if current_sibling.tag == 'p' and len(Parser.getText(current_sibling)) > 0:
         e0 = current_sibling
         if e0.tail:
             e0 = deepcopy(e0)
             e0.tail = ''
         return [e0]
     else:
         potential_paragraphs = Parser.getElementsByTag(current_sibling, tag='p')
         if potential_paragraphs is None:
             return None
         else:
             ps = []
             for first_paragraph in potential_paragraphs:
                 text = Parser.getText(first_paragraph)
                 if len(text) > 0:
                     word_stats = self.stopwords_class(language=self.language).get_stopword_count(text)
                     paragraph_score = word_stats.get_stopword_count()
                     sibling_baseline_score = float(.30)
                     high_link_density = self.is_highlink_density(first_paragraph)
                     score = float(baselinescore_siblings_para * sibling_baseline_score)
                     if score < paragraph_score and not high_link_density:
                         p = Parser.createElement(tag='p', text=text, tail=None)
                         ps.append(p)
             return ps
Example #10
    def get_siblings_score(self, top_node):
        """\
        we could have long articles that have tons of paragraphs
        so if we tried to calculate the base score against
        the total text score of those paragraphs it would be unfair.
        So we need to normalize the score based on the average scoring
        of the paragraphs within the top node.
        For example if our total score of 10 paragraphs was 1000
        but each had an average value of 100 then 100 should be our base.
        """
        base = 100000
        paragraphs_number = 0
        paragraphs_score = 0
        nodes_to_check = Parser.getElementsByTag(top_node, tag='p')

        for node in nodes_to_check:
            text_node = Parser.getText(node)
            word_stats = self.stopwords_class(language=self.language).get_stopword_count(text_node)
            high_link_density = self.is_highlink_density(node)
            if word_stats.get_stopword_count() > 2 and not high_link_density:
                paragraphs_number += 1
                paragraphs_score += word_stats.get_stopword_count()

        if paragraphs_number > 0:
            base = paragraphs_score / paragraphs_number

        return base
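The docstring's own numbers, worked through: a total stopword score of 1000 spread over 10 qualifying paragraphs normalizes to a base of 100:

    paragraphs_score = 1000    # total stopword score across the paragraphs
    paragraphs_number = 10
    base = paragraphs_score / paragraphs_number
    print(base)   # 100.0 (plain 100 under Python 2's integer division)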
Example #11
    def is_highlink_density(self, e):
        """\
        checks the density of links within a node,
        is there not much text and most of it contains linky shit?
        if so it's no good
        """
        links = Parser.getElementsByTag(e, tag='a')
        if links is None or len(links) == 0:
            return False

        text = Parser.getText(e)
        words = text.split(' ')
        words_number = float(len(words))
        sb = []
        for link in links:
            sb.append(Parser.getText(link))

        linkText = ''.join(sb)
        linkWords = linkText.split(' ')
        numberOfLinkWords = float(len(linkWords))
        numberOfLinks = float(len(links))
        linkDivisor = float(numberOfLinkWords / words_number)
        score = float(linkDivisor * numberOfLinks)
        if score >= 1.0:
            return True
        return False
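The density score worked through by hand, for a node holding 20 words of text of which 15 sit inside its two links:

    words_number = 20.0           # words of text in the node
    number_of_link_words = 15.0   # words contributed by the <a> tags
    number_of_links = 2.0
    score = (number_of_link_words / words_number) * number_of_links
    print(score)   # 1.5 -> >= 1.0, so the node counts as link-heavy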
Example #12
    def getMetaLang(self, article):
        """\
        Extract content language from meta
        """
        # we have a lang attribute in html
        attr = Parser.getAttribute(article.doc, attr='lang')
        if attr is None:
            # look for a Content-Language in the meta tags
            items = [{
                'tag': 'meta',
                'attr': 'http-equiv',
                'value': 'content-language'
            }, {
                'tag': 'meta',
                'attr': 'name',
                'value': 'lang'
            }]
            for item in items:
                meta = Parser.getElementsByTag(article.doc, **item)
                if meta:
                    attr = Parser.getAttribute(meta[0], attr='content')
                    break

        if attr:
            value = attr[:2]
            if re.search(RE_LANG, value):
                self.language = value.lower()
                return value.lower()

        return None
Example #13
 def test_tostring(self):
     html = '<html><body>'
     html += '<p>this is a test <a>link</a> and this is <strong>strong</strong></p>'
     html += '</body></html>'
     doc = Parser.fromstring(html)
     result = Parser.nodeToString(doc)
     self.assertEqual(html, result)
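Parser.fromstring and Parser.nodeToString are presumably thin wrappers over lxml.html; a minimal sketch of what the test exercises:

    # Parse and re-serialize; for simple well-formed markup the round
    # trip is byte-identical, which is what the test asserts.
    from lxml import html

    markup = '<html><body><p>this is a test <a>link</a></p></body></html>'
    doc = html.fromstring(markup)
    print(html.tostring(doc, encoding='unicode') == markup)   # True for this input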
Example #14
    def get_siblings_score(self, top_node):
        """\
        we could have long articles that have tons of paragraphs
        so if we tried to calculate the base score against
        the total text score of those paragraphs it would be unfair.
        So we need to normalize the score based on the average scoring
        of the paragraphs within the top node.
        For example if our total score of 10 paragraphs was 1000
        but each had an average value of 100 then 100 should be our base.
        """
        base = 100000
        paragraphs_number = 0
        paragraphs_score = 0
        nodes_to_check = Parser.getElementsByTag(top_node, tag='p')

        for node in nodes_to_check:
            text_node = Parser.getText(node)
            word_stats = self.stopwords_class(language=self.language).get_stopword_count(text_node)
            high_link_density = self.is_highlink_density(node)
            if word_stats.get_stopword_count() > 2 and not high_link_density:
                paragraphs_number += 1
                paragraphs_score += word_stats.get_stopword_count()

        if paragraphs_number > 0:
            base = paragraphs_score / paragraphs_number

        return base
Example #15
 def convertDivsToParagraphs(self, doc, domType):
     badDivs = 0
     elseDivs = 0
     convertedTextNodes = 0
     divs = Parser.getElementsByTag(doc, tag=domType)
     replaceNodesList = {}
     
     divIndex = 0
     errors = []
     goods = []
     regexps = []
     selectors = []
     tags = ['a','blockquote','dl','div','img','ol','p','pre','table','ul']
     
     for div in divs:
         items = Parser.getElementsByTags(div, tags)
         if div is not None and len(items) == 0:
             self.replaceElementsWithPara(doc, div)
             badDivs += 1
         elif div is not None:
             replaceNodes = self.getReplacementNodes(doc, div)
             div.clear()
 
             for c, n in enumerate(replaceNodes):
                 div.insert(c, n)
             
             elseDivs += 1
     
     return doc
Example #16
 def getH1(self, article):
     """\
     Fetch the article H1 tag
     """
     """\
     Searching last h1 tag before main article text begins
     """
     h1 = ''
     if article.topNode is not None:
         lastTag = ''
         for i in article.doc.cssselect('[rel=topnode], h1'):
             if i.tag != 'h1':
                 break
             lastTag = i
             h1 = Parser.getText(i)
         if lastTag == '':
             for i in article.doc.cssselect('[rel=topnode] h1'):
                 h1 = Parser.getText(i)
                 break
     else:
         # Get first H1 tag
         h1Elem = article.doc.find('.//h1')
         """ no h1 found """
         if h1Elem is None:
             return h1
         h1 = Parser.getText(h1Elem)
     return h1
Example #17
 def checkForOpenGraphTag(self):
     """\
     checks to see if we were able to 
     find open graph tags on this page
     """
     node = self.article.rawDoc
     meta = Parser.getElementsByTag(node,
                                    tag='meta',
                                    attr='property',
                                    value='og:image')
     for item in meta:
         href = Parser.getAttribute(item, attr='content')
         if href:
             mainImage = Image()
             mainImage.imageSrc = href
             mainImage.imageExtractionType = "opengraph"
             mainImage.confidenceScore = 100
             locallyStoredImage = self.getLocallyStoredImage(
                 mainImage.imageSrc)
             if locallyStoredImage:
                 mainImage.bytes = locallyStoredImage.bytes
                 mainImage.height = locallyStoredImage.height
                 mainImage.width = locallyStoredImage.width
                 return mainImage
     return None
Example #18
    def getBaselineScoreForSiblings(self, topNode):
        """\
        we could have long articles that have tons of paragraphs
        so if we tried to calculate the base score against
        the total text score of those paragraphs it would be unfair.
        So we need to normalize the score based on the average scoring
        of the paragraphs within the top node.
        For example if our total score of 10 paragraphs was 1000
        but each had an average value of 100 then 100 should be our base.
        """
        base = 100000
        numberOfParagraphs = 0
        scoreOfParagraphs = 0
        nodesToCheck = Parser.getElementsByTag(topNode, tag='p')

        for node in nodesToCheck:
            nodeText = Parser.getText(node)
            wordStats = self.stopwordsCls(language=self.language).getStopWordCount(nodeText)
            highLinkDensity = self.isHighLinkDensity(node)
            if wordStats.getStopWordCount() > 2 and not highLinkDensity:
                numberOfParagraphs += 1
                scoreOfParagraphs += wordStats.getStopWordCount()

        if numberOfParagraphs > 0:
            base = scoreOfParagraphs / numberOfParagraphs

        return base
Example #19
 def remove_nodes_regex(self, doc, pattern):
     for selector in ['id', 'class']:
         reg = "//*[re:test(@%s, '%s', 'i')]" % (selector, pattern)
         naughty_list = doc.xpath(reg, namespaces={'re': self.regexp_namespace})
         for node in naughty_list:
             Parser.remove(node)
     return doc
Example #20
 def checkForLinkTag(self):
     """\
     checks to see if we were able to
     find an image_src link tag on this page
     """
     node = self.article.rawDoc
     meta = Parser.getElementsByTag(node,
                                    tag='link',
                                    attr='rel',
                                    value='image_src')
     for item in meta:
         href = Parser.getAttribute(item, attr='href')
         if href:
             mainImage = Image()
             mainImage.imageSrc = href
             mainImage.imageExtractionType = "linktag"
             mainImage.confidenceScore = 100
             locallyStoredImage = self.getLocallyStoredImage(
                 mainImage.imageSrc)
             if locallyStoredImage:
                 mainImage.bytes = locallyStoredImage.bytes
                 mainImage.height = locallyStoredImage.height
                 mainImage.width = locallyStoredImage.width
                 return mainImage
     return None
Example #21
 def clean_em_tags(self, doc):
     ems = Parser.getElementsByTag(doc, tag='em')
     for node in ems:
         images = Parser.getElementsByTag(node, tag='img')
         if len(images) == 0:
             node.drop_tag()
     return doc
Example #22
    def getBaselineScoreForSiblings(self, topNode):
        """\
        we could have long articles that have tons of paragraphs
        so if we tried to calculate the base score against
        the total text score of those paragraphs it would be unfair.
        So we need to normalize the score based on the average scoring
        of the paragraphs within the top node.
        For example if our total score of 10 paragraphs was 1000
        but each had an average value of 100 then 100 should be our base.
        """
        base = 100000
        numberOfParagraphs = 0
        scoreOfParagraphs = 0
        nodesToCheck = Parser.getElementsByTag(topNode, tag='p')

        for node in nodesToCheck:
            nodeText = Parser.getText(node)
            wordStats = StopWords(
                language=self.language).getStopWordCount(nodeText)
            highLinkDensity = self.isHighLinkDensity(node)
            if wordStats.getStopWordCount() > 2 and not highLinkDensity:
                numberOfParagraphs += 1
                scoreOfParagraphs += wordStats.getStopWordCount()

        if numberOfParagraphs > 0:
            base = scoreOfParagraphs / numberOfParagraphs

        return base
Example #23
 def removeNodesViaRegEx(self, doc, pattern):
     for selector in ['id', 'class']:
         reg = "//*[re:test(@%s, '%s', 'i')]" % (selector, pattern)
         naughtyList = cache.xpath(reg, doc, namespaces={'re':self.regexpNS})
         for node in naughtyList:
             Parser.remove(node)
     return doc
Example #24
 def cleanEmTags(self, doc):
     ems = Parser.getElementsByTag(doc, tag='em')
     for node in ems:
         images = Parser.getElementsByTag(node, tag='img')
         if len(images) == 0:
             node.drop_tag()
     return doc
Example #25
    def isHighLinkDensity(self, e):
        """\
        checks the density of links within a node,
        is there not much text and most of it contains linky shit?
        if so it's no good
        """
        links = Parser.getElementsByTag(e, tag='a')
        if links is None or len(links) == 0:
            return False

        text = Parser.getText(e)
        words = text.split(' ')
        numberOfWords = float(len(words))
        sb = []
        for link in links:
            sb.append(Parser.getText(link))

        linkText = ''.join(sb)
        linkWords = linkText.split(' ')
        numberOfLinkWords = float(len(linkWords))
        numberOfLinks = float(len(links))
        linkDivisor = float(numberOfLinkWords / numberOfWords)
        score = float(linkDivisor * numberOfLinks)
        if score >= 1.0:
            return True
        return False
Example #26
 def cleanEmTags(self, doc):
     ems = Parser.getElementsByTag(doc, tag="em")
     for node in ems:
         images = Parser.getElementsByTag(node, tag="img")
         if len(images) == 0:
             node.drop_tag()
     return doc
Example #27
 def get_siblings_content(self, current_sibling, baselinescore_siblings_para):
     """\
     adds any siblings that may have a decent score to this node
     """
     if current_sibling.tag == 'p' and len(Parser.getText(current_sibling)) > 0:
         e0 = current_sibling
         if e0.tail:
             e0 = deepcopy(e0)
             e0.tail = ''
         return [e0]
     else:
         potential_paragraphs = Parser.getElementsByTag(current_sibling, tag='p')
         if potential_paragraphs is None:
             return None
         else:
             ps = []
             for first_paragraph in potential_paragraphs:
                 text = Parser.getText(first_paragraph)
                 if len(text) > 0:
                     word_stats = self.stopwords_class(language=self.language).get_stopword_count(text)
                     paragraph_score = word_stats.get_stopword_count()
                     sibling_baseline_score = float(.30)
                     high_link_density = self.is_highlink_density(first_paragraph)
                     score = float(baselinescore_siblings_para * sibling_baseline_score)
                     if score < paragraph_score and not high_link_density:
                         p = Parser.createElement(tag='p', text=text, tail=None)
                         ps.append(p)
             return ps
Example #28
    def clean(self, article):
        docToClean = article.doc
        nodelist = self.getNodesToDelete(docToClean)
        for node in nodelist: Parser.remove(node)

        docToClean = self.removeListsWithLinks(docToClean)
        docToClean = self.convertDivsToParagraphs(docToClean, ('div','dl','article'))
        return docToClean
Example #29
 def remove_nodes_regex(self, doc, pattern):
     for selector in ['id', 'class']:
         reg = "//*[re:test(@%s, '%s', 'i')]" % (selector, pattern)
         naughty_list = doc.xpath(reg,
                                  namespaces={'re': self.regexp_namespace})
         for node in naughty_list:
             Parser.remove(node)
     return doc
Example #30
 def walk_siblings(self, node):
     current_sibling = Parser.previousSibling(node)
     b = []
     while current_sibling is not None:
         b.append(current_sibling)
         previousSibling = Parser.previousSibling(current_sibling)
         current_sibling = None if previousSibling is None else previousSibling
     return b
Example #31
 def dropTags(self, doc, tags):
     for tag in tags:
         ems = Parser.getElementsByTag(doc, tag=tag)
         for node in ems:
             images = Parser.getElementsByTag(node, tag='img')
             if len(images) == 0:
                 node.drop_tag()
     return doc
Example #32
 def walkSiblings(self, node):
     currentSibling = Parser.previousSibling(node)
     b = []
     while currentSibling is not None:
         b.append(currentSibling)
         previousSibling = Parser.previousSibling(currentSibling)
         currentSibling = None if previousSibling is None else previousSibling
     return b
Example #33
 def replaceTagsWithText(self):
     """\
     replace common tags with just 
     text so we don't have any crazy formatting issues
     so replace <br>, <i>, <strong>, etc.... 
     with whatever text is inside them
     code : http://lxml.de/api/lxml.etree-module.html#strip_tags
     """
     Parser.stripTags(self.getTopNode(), 'b', 'strong', 'i', 'br')
Example #34
 def replaceTagsWithText(self):
     """\
     replace common tags with just 
     text so we don't have any crazy formatting issues
     so replace <br>, <i>, <strong>, etc.... 
     with whatever text is inside them
     code : http://lxml.de/api/lxml.etree-module.html#strip_tags
     """
     Parser.stripTags(self.getTopNode(), 'b', 'strong', 'i', 'br')
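A standalone illustration of the lxml strip_tags call that the docstring links to: the listed tags disappear while their text is merged into the surrounding content:

    from lxml import etree, html

    doc = html.fromstring('<p>keep <b>bold</b> and <i>italic</i> words</p>')
    etree.strip_tags(doc, 'b', 'i')
    print(html.tostring(doc, encoding='unicode'))
    # <p>keep bold and italic words</p>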
Example #35
 def convertToText(self, article):
     text = Parser.getFormattedText(self.topNode)
     lines = text.split(u'\n')
     good_lines = []
     for line in lines:
         if re.search('[^ \xa0]',line): good_lines.append(line.strip())
     text = u'\n'.join(good_lines)
     Parser.adjustTopNode(article)
     return text
Example #36
 def removeNodesViaRegEx(self, doc, pattern):
     for selector in ['id', 'class']:
         reg = "//*[re:test(@%s, '%s', 'i')]" % (selector, pattern)
         naughtyList = cache.xpath(reg,
                                   doc,
                                   namespaces={'re': self.regexpNS})
         for node in naughtyList:
             Parser.remove(node)
     return doc
Example #37
    def isTableTagAndNoParagraphsExist(self, e):
        return False  # NOTE: short-circuited; the checks below never run
        subParagraphs = Parser.getElementsByTag(e, tag='p')
        for p in subParagraphs:
            txt = Parser.getText(p)
            if len(txt) < 25:
                Parser.remove(p)

        if not Parser.hasChildTag(e, 'p') and e.tag != "td":
            return True
        return False
Example #38
    def is_table_and_no_para_exist(self, e):
        subParagraphs = Parser.getElementsByTag(e, tag='p')
        for p in subParagraphs:
            txt = Parser.getText(p)
            if len(txt) < 25:
                Parser.remove(p)

        subParagraphs2 = Parser.getElementsByTag(e, tag='p')
        if len(subParagraphs2) == 0 and e.tag != "td":
            return True
        return False
Example #39
    def isTableTagAndNoParagraphsExist(self, e):
        subParagraphs = Parser.getElementsByTag(e, tag='p')
        for p in subParagraphs:
            txt = Parser.getText(p)
            if len(txt) < 25:
                Parser.remove(p)

        subParagraphs2 = Parser.getElementsByTag(e, tag='p')
        if len(subParagraphs2) == 0 and e.tag != "td":
            return True
        return False
Example #40
 def test_striptags(self):
     html = '<html><body>'
     html += '<p>this is a test <a>link</a> and this is <strong>strong</strong></p>'
     html += '</body></html>'
     expected = '<html><body>'
     expected += '<p>this is a test link and this is strong</p>'
     expected += '</body></html>'
     doc = Parser.fromstring(html)
     Parser.stripTags(doc, 'a', 'strong')
     result = Parser.nodeToString(doc)
     self.assertEqual(expected, result)
Example #41
 def removeNodesWithNegativeScores(self):
     """\
     if there are elements inside our top node
     that have a negative gravity score,
     let's give em the boot
     """
     return  # NOTE: short-circuited; the cleanup below never runs
     gravityItems = self.topNode.cssselect("*[gravityScore]")
     for item in gravityItems:
         score = int(item.attrib.get('gravityScore', 0))
         if score < 1:
             Parser.remove(item)
Example #42
 def post_cleanup(self, targetNode):
     """\
     remove any divs that look like non-content,
     clusters of links, or paras with no gusto
     """
     node = self.add_siblings(targetNode)
     for e in node.getchildren():
         if e.tag != 'p':
             if self.is_highlink_density(e) \
                 or self.is_table_and_no_para_exist(e) \
                 or not self.is_nodescore_threshold_met(node, e):
                 Parser.remove(e)
     return node
Example #43
 def post_cleanup(self, targetNode):
     """\
     remove any divs that look like non-content,
     clusters of links, or paras with no gusto
     """
     node = self.add_siblings(targetNode)
     for e in node.getchildren():
         if e.tag != 'p':
             if self.is_highlink_density(e) \
                 or self.is_table_and_no_para_exist(e) \
                 or not self.is_nodescore_threshold_met(node, e):
                 Parser.remove(e)
     return node
Example #44
    def test_getElementsByTags(self):
        html = '<html><body>'
        html += '<p>this is a test <a class="link">link</a> and this is <strong class="link">strong</strong></p>'
        html += '<p>this is a test and this is <strong class="link">strong</strong></p>'
        html += '</body></html>'
        doc = Parser.fromstring(html)
        elements = Parser.getElementsByTags(doc, ['p', 'a', 'strong'])
        self.assertEqual(len(elements), 5)

        # find childs within the first p
        p = Parser.getElementsByTag(doc, tag='p')[0]
        elements = Parser.getElementsByTags(p, ['p', 'a', 'strong'])
        self.assertEqual(len(elements), 2)
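One way getElementsByTags could be implemented (an assumption, since the Parser internals are not shown) is a comma-separated selector group, which lxml's CSSSelector matches in document order:

    from lxml import html
    from lxml.cssselect import CSSSelector

    doc = html.fromstring('<div><p>a <a>b</a> <strong>c</strong></p></div>')
    select = CSSSelector('p, a, strong')
    print([e.tag for e in select(doc)])   # ['p', 'a', 'strong']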
Example #45
 def postExtractionCleanup(self, targetNode):
     """\
     remove any divs that look like non-content,
     clusters of links, or paras with no gusto
     """
     node = self.addSiblings(targetNode)
     for e in node.getchildren():
         if e.tag != 'p':
             if self.isHighLinkDensity(e) \
                 or self.isTableTagAndNoParagraphsExist(e) \
                 or not self.isNodeScoreThreshholdMet(node, e):
                 Parser.remove(e)
     return node
Example #46
 def getDepthLevel(self, node, parentDepth, siblingDepth):
     MAX_PARENT_DEPTH = 2
     if parentDepth > MAX_PARENT_DEPTH:
         return None
     else:
         siblingNode = Parser.previousSibling(node)
         if siblingNode is not None:
             return DepthTraversal(siblingNode, parentDepth,
                                   siblingDepth + 1)
         elif node is not None:
             parent = Parser.getParent(node)
             if parent is not None:
                 return DepthTraversal(parent, parentDepth + 1, 0)
     return None
Example #47
    def is_boostable(self, node):
        """\
        a lot of the time the first paragraph might be the caption under an
        image, so before boosting a parent node we want to make sure it is
        connected to other paragraphs, at least for the first n paragraphs;
        that is, the next sibling should be a paragraph with at least some
        substantial weight to it
        """
        para = "p"
        steps_away = 0
        minimum_stopword_count = 5
        max_stepsaway_from_node = 3

        nodes = self.walk_siblings(node)
        for current_node in nodes:
            # p
            if current_node.tag == para:
                if steps_away >= max_stepsaway_from_node:
                    return False
                paraText = Parser.getText(current_node)
                word_stats = self.stopwords_class(language=self.language).get_stopword_count(paraText)
                if word_stats.get_stopword_count() > minimum_stopword_count:
                    return True
                steps_away += 1
        return False
Example #48
    def remove_drop_caps(self, doc):
        items = Parser.css_select(
            doc, "span[class~=dropcap], span[class~=drop_cap]")
        for item in items:
            item.drop_tag()

        return doc
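drop_tag() in isolation: the span element disappears but its text stays behind in the parent, which is exactly what you want for decorative drop caps:

    from lxml import html

    doc = html.fromstring('<p><span class="dropcap">T</span>he story begins.</p>')
    for item in doc.cssselect('span.dropcap'):
        item.drop_tag()
    print(html.tostring(doc, encoding='unicode'))   # <p>The story begins.</p>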
Example #49
    def isOkToBoost(self, node):
        """\
        a lot of the time the first paragraph might be the caption under an
        image, so before boosting a parent node we want to make sure it is
        connected to other paragraphs, at least for the first n paragraphs;
        that is, the next sibling should be a paragraph with at least some
        substantial weight to it
        """
        para = "p"
        stepsAway = 0
        minimumStopWordCount = 5
        maxStepsAwayFromNode = 3

        nodes = self.walkSiblings(node)
        for currentNode in nodes:
            # p
            if currentNode.tag == para:
                if stepsAway >= maxStepsAwayFromNode:
                    return False
                paraText = Parser.getText(currentNode)
                wordStats = StopWords(
                    language=self.language).getStopWordCount(paraText)
                if wordStats.getStopWordCount() > minimumStopWordCount:
                    return True
                stepsAway += 1
        return False
Example #50
 def convertToText(self):
     txts = []
     for node in list(self.getTopNode()):
         txt = Parser.getText(node)
         if txt:
             txt = HTMLParser().unescape(txt)
             txts.append(innerTrim(txt))
     return '\n\n'.join(txts)
Example #51
    def extract_tags(self, article):
        node = article.doc

        # node doesn't have children
        if len(list(node)) == 0:
            return NO_STRINGS

        elements = Parser.css_select(node, A_REL_TAG_SELECTOR)
        if elements is None:
            return NO_STRINGS

        tags = []
        for el in elements:
            tag = Parser.getText(el)
            if tag:
                tags.append(tag)

        return set(tags)
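A_REL_TAG_SELECTOR is not shown in the snippet. Following the rel-tag microformat, a plausible value is an anchor selector like the hypothetical one below:

    from lxml import html

    A_REL_TAG_SELECTOR = 'a[rel=tag]'   # assumption, not taken from the source
    doc = html.fromstring('<div><a rel="tag" href="/t/python">python</a></div>')
    print([a.text for a in doc.cssselect(A_REL_TAG_SELECTOR)])   # ['python']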
Example #52
    def checkForKnownElements(self):
        """\
        in here we check for known image containers from sites
        we've checked out, like yahoo, techcrunch, etc., that have
        * known places to look for good images.
        * TODO: enable this to use a series of settings files
          so people can define what the image ids/classes
          are on specific sites
        domain = self.getCleanDomain()
        if domain in self.customSiteMapping.keys():
            classes = self.customSiteMapping.get(domain).split('|')
            for classname in classes:
                KNOWN_IMG_DOM_NAMES.append(classname)

        knownImage = None

        for knownName in KNOWN_IMG_DOM_NAMES:
            known = Parser.getElementById(self.article.rawDoc, knownName)
            if not known:
                known = Parser.getElementsByTag(self.article.rawDoc,
                                                attr='class',
                                                value=knownName)
                if known:
                    known = known[0]
            if known:
                mainImage = Parser.getElementsByTag(known, tag='img')
                if mainImage:
                    knownImage = mainImage[0]

        if knownImage is not None:
            knownImgSrc = Parser.getAttribute(knownImage, attr='src')
            mainImage = Image()
            mainImage.imageSrc = self.buildImagePath(knownImgSrc)
            mainImage.imageExtractionType = "known"
            mainImage.confidenceScore = 90
            locallyStoredImage = self.getLocallyStoredImage(mainImage.imageSrc)
            if locallyStoredImage:
                mainImage.bytes = locallyStoredImage.bytes
                mainImage.height = locallyStoredImage.height
                mainImage.width = locallyStoredImage.width

            return mainImage
Example #53
 def getNodesToCheck(self, doc):
     """\
     returns a list of nodes we want to search
     on like paragraphs and tables
     """
     nodesToCheck = []
     for tag in ['p', 'pre', 'td']:
         items = Parser.getElementsByTag(doc, tag=tag)
         nodesToCheck += items
     return nodesToCheck
Example #54
 def remove_negativescores_nodes(self):
     """\
     if there are elements inside our top node
     that have a negative gravity score,
     let's give em the boot
     """
     gravity_items = Parser.css_select(self.top_node, "*[gravityScore]")
     for item in gravity_items:
         score = int(item.attrib.get('gravityScore', 0))
         if score < 1:
             item.getparent().remove(item)
Example #55
    def removeScriptsAndStyles(self, doc):
        # remove scripts
        scripts = Parser.getElementsByTag(doc, tag='script')
        for item in scripts:
            Parser.remove(item)

        # remove styles
        styles = Parser.getElementsByTag(doc, tag='style')
        for item in styles:
            Parser.remove(item)

        # remove comments
        comments = Parser.getComments(doc)
        for item in comments:
            Parser.remove(item)

        return doc
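Parser.getComments is not shown; in plain lxml, comment nodes can be collected with the comment() node test, which is presumably what it wraps:

    from lxml import html

    doc = html.fromstring('<div><!-- ad slot --><p>text</p></div>')
    for comment in doc.xpath('//comment()'):
        comment.getparent().remove(comment)
    print(html.tostring(doc, encoding='unicode'))   # <div><p>text</p></div>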
Example #56
    def getTitle(self, article):
        """\
        Fetch the article title and analyze it
        """

        title = ''
        doc = article.doc

        titleElem = Parser.getElementsByTag(doc, tag='title')
        # no title found
        if titleElem is None or len(titleElem) == 0:
            return title

        # title elem found
        titleText = Parser.getText(titleElem[0])
        usedDelimeter = False

        # split title with |
        if '|' in titleText:
            titleText = self.doTitleSplits(titleText, PIPE_SPLITTER)
            usedDelimeter = True

        # split title with -
        if not usedDelimeter and '-' in titleText:
            titleText = self.doTitleSplits(titleText, DASH_SPLITTER)
            usedDelimeter = True

        # split title with »
        if not usedDelimeter and u'»' in titleText:
            titleText = self.doTitleSplits(titleText, ARROWS_SPLITTER)
            usedDelimeter = True

        # split title with :
        if not usedDelimeter and ':' in titleText:
            titleText = self.doTitleSplits(titleText, COLON_SPLITTER)
            usedDelimeter = True

        title = MOTLEY_REPLACEMENT.replaceAll(titleText)
        return title
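doTitleSplits and the *_SPLITTER constants are not shown here. A hypothetical version of the idea is to split on the delimiter and keep the longest piece, which tends to be the headline rather than the site name:

    # Hypothetical sketch only; the real doTitleSplits is not in the snippet.
    def do_title_splits(title_text, splitter):
        pieces = [piece.strip() for piece in title_text.split(splitter)]
        return max(pieces, key=len)

    print(do_title_splits('Markets rally on rate news | Example News', '|'))
    # Markets rally on rate news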