def getMetaLang(self, article):
    """\
    Extract content language from meta
    """
    # we have a lang attribute in html
    attr = Parser.getAttribute(article.doc, attr='lang')
    if attr is None:
        # look up for a Content-Language in meta
        items = [
            {'tag': 'meta', 'attr': 'http-equiv', 'value': 'content-language'},
            {'tag': 'meta', 'attr': 'name', 'value': 'lang'}
        ]
        for item in items:
            meta = Parser.getElementsByTag(article.doc, **item)
            if meta:
                attr = Parser.getAttribute(meta[0], attr='content')
                break
    if attr:
        value = attr[:2]
        if re.search(RE_LANG, value):
            self.language = value.lower()
            return value.lower()
    return None
def get_replacement_nodes(self, doc, div):
    replacement_text = []
    nodes_to_return = []
    nodes_to_remove = []
    childs = Parser.childNodesWithText(div)
    for kid in childs:
        # node is a p
        # and already have some replacement text
        if Parser.getTag(kid) == 'p' and len(replacement_text) > 0:
            new_node = self.get_flushed_buffer(''.join(replacement_text), doc)
            nodes_to_return.append(new_node)
            replacement_text = []
            nodes_to_return.append(kid)
        # node is a text node
        elif Parser.isTextNode(kid):
            kid_text_node = kid
            kid_text = Parser.getText(kid)
            replace_text = self.tablines_replacements.replaceAll(kid_text)
            if len(replace_text) > 1:
                # pull in leading <a> siblings of the text node
                previous_sibling_node = Parser.previousSibling(kid_text_node)
                while previous_sibling_node is not None \
                        and Parser.getTag(previous_sibling_node) == "a" \
                        and Parser.getAttribute(previous_sibling_node, 'grv-usedalready') != 'yes':
                    outer = " " + Parser.outerHtml(previous_sibling_node) + " "
                    replacement_text.append(outer)
                    nodes_to_remove.append(previous_sibling_node)
                    Parser.setAttribute(previous_sibling_node,
                                        attr='grv-usedalready', value='yes')
                    previous_sibling_node = Parser.previousSibling(previous_sibling_node)
                # append replace_text
                replacement_text.append(replace_text)
                # pull in trailing <a> siblings as well
                next_sibling_node = Parser.nextSibling(kid_text_node)
                while next_sibling_node is not None \
                        and Parser.getTag(next_sibling_node) == "a" \
                        and Parser.getAttribute(next_sibling_node, 'grv-usedalready') != 'yes':
                    outer = " " + Parser.outerHtml(next_sibling_node) + " "
                    replacement_text.append(outer)
                    nodes_to_remove.append(next_sibling_node)
                    Parser.setAttribute(next_sibling_node,
                                        attr='grv-usedalready', value='yes')
                    next_sibling_node = Parser.nextSibling(next_sibling_node)
        # otherwise
        else:
            nodes_to_return.append(kid)
    # flush out anything still remaining
    if len(replacement_text) > 0:
        new_node = self.get_flushed_buffer(''.join(replacement_text), doc)
        nodes_to_return.append(new_node)
        replacement_text = []
    for n in nodes_to_remove:
        Parser.remove(n)
    return nodes_to_return
def getMetaLang(self, article):
    """\
    Extract content language from meta
    """
    # we have a lang attribute in html
    attr = Parser.getAttribute(article.doc, attr='lang')
    if attr is None:
        # look up for a Content-Language in meta
        items = [{
            'tag': 'meta',
            'attr': 'http-equiv',
            'value': 'content-language'
        }, {
            'tag': 'meta',
            'attr': 'name',
            'value': 'lang'
        }]
        for item in items:
            meta = Parser.getElementsByTag(article.doc, **item)
            if meta:
                attr = Parser.getAttribute(meta[0], attr='content')
                break
    if attr:
        value = attr[:2]
        if re.search(RE_LANG, value):
            self.language = value.lower()
            return value.lower()
    return None
def checkForOpenGraphTag(self):
    """\
    checks to see if we were able to find open graph tags on this page
    """
    node = self.article.rawDoc
    meta = Parser.getElementsByTag(node, tag='meta',
                                   attr='property', value='og:image')
    for item in meta:
        href = Parser.getAttribute(item, attr='content')
        if href:
            mainImage = Image()
            mainImage.imageSrc = href
            mainImage.imageExtractionType = "opengraph"
            mainImage.confidenceScore = 100
            locallyStoredImage = self.getLocallyStoredImage(mainImage.imageSrc)
            if locallyStoredImage:
                mainImage.bytes = locallyStoredImage.bytes
                mainImage.height = locallyStoredImage.height
                mainImage.width = locallyStoredImage.width
            return mainImage
    return None
def getMetaLang(self, article):
    """\
    Extract content languages from metas
    """
    # we have a lang attribute in html
    meta_langs = []
    attr = Parser.getAttribute(article.doc, attr='lang')
    if attr is not None:
        meta_langs += attr.replace(' ', '').lower().split(',')
    # look up for a Content-Language in meta
    # (attribute name, expected value prefix) pairs
    attrs = [
        ('http-equiv', 'content-language'),
        ('name', 'lang'),
        ('name', 'og:lang'),
    ]
    head = article.doc.find('head')
    if head is not None:
        metas = Parser.getElementsByTag(head, tag='meta')
        for meta in metas:
            for attr, value in attrs:
                if meta.attrib.get(attr, '').lower().startswith(value):
                    langs = meta.attrib.get('content', None)
                    if langs is not None:
                        meta_langs += langs.replace(' ', '').lower().split(',')
            if 'lang' in meta.attrib:
                meta_langs += meta.attrib['lang'].replace(' ', '').lower().split(',')
    result = []
    for lang in meta_langs:
        lang = lang[:2]
        if re.search(RE_LANG, lang):
            result.append(lang)
    return result
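# A minimal, self-contained sketch of the language-normalization step used in the
# variant above: comma-separated values such as "en-US, fr" are stripped of spaces,
# lowercased, truncated to two letters, and validated. The helper name and the
# regex below are illustrative assumptions, not part of the extractor itself.
import re

RE_LANG = r'^[A-Za-z]{2}$'


def normalize_meta_langs(raw_values):
    """Return the distinct two-letter codes found in raw meta-language strings."""
    langs = []
    for raw in raw_values:
        for lang in raw.replace(' ', '').lower().split(','):
            lang = lang[:2]
            if re.search(RE_LANG, lang) and lang not in langs:
                langs.append(lang)
    return langs

# e.g. normalize_meta_langs(['en-US, fr', 'de']) -> ['en', 'fr', 'de']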
def checkForLinkTag(self):
    """\
    checks to see if we were able to find a link_src on this page
    """
    node = self.article.rawDoc
    meta = Parser.getElementsByTag(node, tag='link',
                                   attr='rel', value='image_src')
    for item in meta:
        href = Parser.getAttribute(item, attr='href')
        if href:
            mainImage = Image()
            mainImage.imageSrc = href
            mainImage.imageExtractionType = "linktag"
            mainImage.confidenceScore = 100
            locallyStoredImage = self.getLocallyStoredImage(mainImage.imageSrc)
            if locallyStoredImage:
                mainImage.bytes = locallyStoredImage.bytes
                mainImage.height = locallyStoredImage.height
                mainImage.width = locallyStoredImage.width
            return mainImage
    return None
def getMetaLang(self, article):
    """\
    Extract content language from meta
    """
    # we have a lang attribute in html
    attr = Parser.getAttribute(article.doc, attr='lang')
    if attr is None:
        # look up for a Content-Language in meta
        kwargs = {'tag': 'meta',
                  'attr': 'http-equiv',
                  'value': 'content-language'}
        meta = Parser.getElementsByTag(article.doc, **kwargs)
        if meta:
            attr = Parser.getAttribute(meta[0], attr='content')
    if attr:
        value = attr[:2]
        if re.search(RE_LANG, value):
            return value.lower()
    return None
def isOkImageFileName(self, imageNode):
    """\
    will check the image src against a list
    of bad image files we know of like buttons, etc...
    """
    imgSrc = Parser.getAttribute(imageNode, attr='src')
    if not imgSrc:
        return False
    if self.matchBadImageNames.search(imgSrc):
        return False
    return True
def is_valid_filename(self, imageNode):
    """\
    will check the image src against a list
    of bad image files we know of like buttons, etc...
    """
    src = Parser.getAttribute(imageNode, attr='src')
    if not src:
        return False
    if self.badimages_names_re.search(src):
        return False
    return True
def getMetaLang(self, article):
    """\
    Extract content language from meta
    """
    # we have a lang attribute in html
    attr = Parser.getAttribute(article.doc, attr='lang')
    if attr is None:
        # look up for a Content-Language in meta
        kwargs = {
            'tag': 'meta',
            'attr': 'http-equiv',
            'value': 'content-language'
        }
        meta = Parser.getElementsByTag(article.doc, **kwargs)
        if meta:
            attr = Parser.getAttribute(meta[0], attr='content')
    if attr:
        value = attr[:2]
        if re.search(RE_LANG, value):
            return value.lower()
    return None
def downloadImagesAndGetResults(self, images, depthLevel):
    """\
    download the images to temp disk and set their dimensions
    - we're going to score the images in the order in which they appear
      so images higher up will have more importance,
    - we'll count the area of the 1st image as a score of 1 and then calculate
      how much larger or small each image after it is
    - we'll also make sure to try and weed out banner type ad blocks
      that have big widths and small heights or vice versa
    - so if the image is 3rd found in the dom its sequence score would be
      1 / 3 = .33 * diff in area from the first image
    """
    imageResults = {}
    initialArea = float(0.0)
    totalScore = float(0.0)
    cnt = float(1.0)
    MIN_WIDTH = 50
    for image in images[:30]:
        imgSrc = Parser.getAttribute(image, attr='src')
        imgSrc = self.buildImagePath(imgSrc)
        locallyStoredImage = self.getLocallyStoredImage(imgSrc)
        width = locallyStoredImage.width
        height = locallyStoredImage.height
        imageSrc = locallyStoredImage.imgSrc
        fileExtension = locallyStoredImage.fileExtension
        if fileExtension != '.gif' and fileExtension != 'NA':
            if (depthLevel >= 1 and locallyStoredImage.width > 300) or depthLevel < 1:
                if not self.isBannerDimensions(width, height):
                    if width > MIN_WIDTH:
                        sequenceScore = float(1.0 / cnt)
                        area = float(width * height)
                        totalScore = float(0.0)
                        if initialArea == 0:
                            initialArea = area * float(1.48)
                            totalScore = 1
                        else:
                            areaDifference = float(area / initialArea)
                            totalScore = sequenceScore * areaDifference
                        imageResults.update({locallyStoredImage: totalScore})
                        cnt += 1
        cnt += 1
    return imageResults
def fetch_images(self, images, depth_level):
    """\
    download the images to temp disk and set their dimensions
    - we're going to score the images in the order in which they appear
      so images higher up will have more importance,
    - we'll count the area of the 1st image as a score of 1 and then calculate
      how much larger or small each image after it is
    - we'll also make sure to try and weed out banner type ad blocks
      that have big widths and small heights or vice versa
    - so if the image is 3rd found in the dom its sequence score would be
      1 / 3 = .33 * diff in area from the first image
    """
    image_results = {}
    initial_area = float(0.0)
    total_score = float(0.0)
    cnt = float(1.0)
    MIN_WIDTH = 50
    for image in images[:30]:
        src = Parser.getAttribute(image, attr='src')
        src = self.build_image_path(src)
        local_image = self.get_local_image(src)
        width = local_image.width
        height = local_image.height
        src = local_image.src
        file_extension = local_image.file_extension
        if file_extension != '.gif' and file_extension != 'NA':
            if (depth_level >= 1 and local_image.width > 300) or depth_level < 1:
                if not self.is_banner_dimensions(width, height):
                    if width > MIN_WIDTH:
                        sequence_score = float(1.0 / cnt)
                        area = float(width * height)
                        total_score = float(0.0)
                        if initial_area == 0:
                            initial_area = area * float(1.48)
                            total_score = 1
                        else:
                            area_difference = float(area / initial_area)
                            total_score = sequence_score * area_difference
                        image_results.update({local_image: total_score})
                        cnt += 1
        cnt += 1
    return image_results
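# A standalone sketch of the scoring scheme described in the docstrings above:
# each image gets a sequence score of 1 / position, the first usable image sets
# the reference area (padded by 1.48), and later images score as sequence score
# times their area relative to that reference. The helper and the dimensions in
# the usage note are hypothetical and not part of the extractor.
def score_images(dimensions):
    """dimensions: list of (width, height) tuples in document order."""
    scores = []
    initial_area = 0.0
    for position, (width, height) in enumerate(dimensions, start=1):
        area = float(width * height)
        if initial_area == 0.0:
            # first image becomes the reference and gets a score of 1
            initial_area = area * 1.48
            scores.append(1.0)
        else:
            scores.append((1.0 / position) * (area / initial_area))
    return scores

# e.g. score_images([(600, 400), (600, 400), (120, 600)])
# -> [1.0, ~0.34, ~0.07]: later and smaller images score lower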
def checkForKnownElements(self):
    """\
    in here we check for known image containers from sites
    we've checked out like yahoo, techcrunch, etc... that have
    * known places to look for good images.
    * TODO: enable this to use a series of settings files
      so people can define what the image ids/classes are on specific sites
    """
    domain = self.getCleanDomain()
    if domain in self.customSiteMapping.keys():
        classes = self.customSiteMapping.get(domain).split('|')
        for classname in classes:
            KNOWN_IMG_DOM_NAMES.append(classname)

    knownImage = None

    for knownName in KNOWN_IMG_DOM_NAMES:
        known = Parser.getElementById(self.article.rawDoc, knownName)
        if not known:
            known = Parser.getElementsByTag(self.article.rawDoc,
                                            attr='class', value=knownName)
            if known:
                known = known[0]
        if known:
            mainImage = Parser.getElementsByTag(known, tag='img')
            if mainImage:
                knownImage = mainImage[0]

    if knownImage is not None:
        knownImgSrc = Parser.getAttribute(knownImage, attr='src')
        mainImage = Image()
        mainImage.imageSrc = self.buildImagePath(knownImgSrc)
        mainImage.imageExtractionType = "known"
        mainImage.confidenceScore = 90
        locallyStoredImage = self.getLocallyStoredImage(mainImage.imageSrc)
        if locallyStoredImage:
            mainImage.bytes = locallyStoredImage.bytes
            mainImage.height = locallyStoredImage.height
            mainImage.width = locallyStoredImage.width
        return mainImage
def checkForKnownElements(self):
    """\
    in here we check for known image containers from sites
    we've checked out like yahoo, techcrunch, etc... that have
    * known places to look for good images.
    * TODO: enable this to use a series of settings files
      so people can define what the image ids/classes are on specific sites
    """
    domain = self.getCleanDomain()
    if domain in self.customSiteMapping.keys():
        classes = self.customSiteMapping.get(domain).split('|')
        for classname in classes:
            KNOWN_IMG_DOM_NAMES.append(classname)

    knownImage = None

    for knownName in KNOWN_IMG_DOM_NAMES:
        known = Parser.getElementById(self.article.rawDoc, knownName)
        if known is None:
            known = Parser.getElementsByTag(self.article.rawDoc,
                                            attr='class', value=knownName)
            known = known[0] if known else None
        if known is not None:
            mainImage = Parser.getElementsByTag(known, tag='img')
            if mainImage:
                knownImage = mainImage[0]

    if knownImage is not None:
        knownImgSrc = Parser.getAttribute(knownImage, attr='src')
        mainImage = Image()
        mainImage.imageSrc = self.buildImagePath(knownImgSrc)
        mainImage.imageExtractionType = "known"
        mainImage.confidenceScore = 90
        locallyStoredImage = self.getLocallyStoredImage(mainImage.imageSrc)
        if locallyStoredImage:
            mainImage.bytes = locallyStoredImage.bytes
            mainImage.height = locallyStoredImage.height
            mainImage.width = locallyStoredImage.width
        return mainImage
def check_known_elements(self):
    """\
    in here we check for known image containers from sites
    we've checked out like yahoo, techcrunch, etc... that have
    * known places to look for good images.
    * TODO: enable this to use a series of settings files
      so people can define what the image ids/classes are on specific sites
    """
    domain = self.get_clean_domain()
    if domain in self.custom_site_mapping.keys():
        classes = self.custom_site_mapping.get(domain).split('|')
        for classname in classes:
            KNOWN_IMG_DOM_NAMES.append(classname)

    known_image = None

    for known_name in KNOWN_IMG_DOM_NAMES:
        known = Parser.getElementById(self.article.raw_doc, known_name)
        if not known:
            known = Parser.getElementsByTag(self.article.raw_doc,
                                            attr='class', value=known_name)
            if known:
                known = known[0]
        if known:
            main_image = Parser.getElementsByTag(known, tag='img')
            if main_image:
                known_image = main_image[0]

    if known_image is not None:
        known_image_source = Parser.getAttribute(known_image, attr='src')
        main_image = Image()
        main_image.src = self.build_image_path(known_image_source)
        main_image.extraction_type = "known"
        main_image.confidence_score = 90
        local_image = self.get_local_image(main_image.src)
        if local_image:
            main_image.bytes = local_image.bytes
            main_image.height = local_image.height
            main_image.width = local_image.width
        return main_image
def check_link_tag(self):
    """\
    checks to see if we were able to find a link_src on this page
    """
    node = self.article.raw_doc
    meta = Parser.getElementsByTag(node, tag='link',
                                   attr='rel', value='image_src')
    for item in meta:
        href = Parser.getAttribute(item, attr='href')
        if href:
            main_image = Image()
            main_image.src = href
            main_image.extraction_type = "linktag"
            main_image.confidence_score = 100
            local_image = self.get_local_image(main_image.src)
            if local_image:
                main_image.bytes = local_image.bytes
                main_image.height = local_image.height
                main_image.width = local_image.width
            return main_image
    return None
def check_opengraph_tag(self):
    """\
    checks to see if we were able to find open graph tags on this page
    """
    node = self.article.raw_doc
    meta = Parser.getElementsByTag(node, tag='meta',
                                   attr='property', value='og:image')
    for item in meta:
        href = Parser.getAttribute(item, attr='content')
        if href:
            main_image = Image()
            main_image.src = href
            main_image.extraction_type = "opengraph"
            main_image.confidence_score = 100
            local_image = self.get_local_image(main_image.src)
            if local_image:
                main_image.bytes = local_image.bytes
                main_image.height = local_image.height
                main_image.width = local_image.width
            return main_image
    return None
def findImagesThatPassByteSizeTest(self, images):
    """\
    loop through all the images and find the ones
    that have the best bytes to even make them a candidate
    """
    cnt = 0
    MAX_BYTES_SIZE = 15728640
    goodImages = []
    # iterate over a copy so removals don't skip elements
    for image in images[:]:
        if cnt > 30:
            return goodImages
        imgSrc = Parser.getAttribute(image, attr='src')
        imgSrc = self.buildImagePath(imgSrc)
        locallyStoredImage = self.getLocallyStoredImage(imgSrc)
        if locallyStoredImage:
            bytes = locallyStoredImage.bytes
            if (bytes == 0 or bytes > self.minBytesForImages) \
                    and bytes < MAX_BYTES_SIZE:
                goodImages.append(image)
            else:
                images.remove(image)
        cnt += 1
    return goodImages if len(goodImages) > 0 else None
def get_images_bytesize_match(self, images):
    """\
    loop through all the images and find the ones
    that have the best bytes to even make them a candidate
    """
    cnt = 0
    MAX_BYTES_SIZE = 15728640
    good_images = []
    # iterate over a copy so removals don't skip elements
    for image in images[:]:
        if cnt > 30:
            return good_images
        src = Parser.getAttribute(image, attr='src')
        src = self.build_image_path(src)
        local_image = self.get_local_image(src)
        if local_image:
            bytes = local_image.bytes
            if (bytes == 0 or bytes > self.images_min_bytes) \
                    and bytes < MAX_BYTES_SIZE:
                good_images.append(image)
            else:
                images.remove(image)
        cnt += 1
    return good_images if len(good_images) > 0 else None
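# A small sketch of the byte-size filter applied above: an image passes when its
# reported size is 0 or above the configured minimum, and in either case below
# the 15 MB ceiling. The 50 kB minimum here is a hypothetical default used only
# for illustration.
MAX_BYTES_SIZE = 15728640  # 15 MB


def passes_bytesize_test(size_in_bytes, min_bytes=50000):
    return (size_in_bytes == 0 or size_in_bytes > min_bytes) \
        and size_in_bytes < MAX_BYTES_SIZE

# e.g. passes_bytesize_test(0) -> True, passes_bytesize_test(1024) -> False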
def getReplacementNodes(self, doc, div):
    replacementText = []
    nodesToReturn = []
    nodesToRemove = []
    childs = Parser.childNodesWithText(div)
    for kid in childs:
        # node is a p
        # and already have some replacement text
        if Parser.getTag(kid) == 'p' and len(replacementText) > 0:
            newNode = self.getFlushedBuffer(''.join(replacementText), doc)
            nodesToReturn.append(newNode)
            replacementText = []
            nodesToReturn.append(kid)
        # node is a text node
        elif Parser.isTextNode(kid):
            kidTextNode = kid
            kidText = Parser.getText(kid)
            replaceText = self.tabsAndNewLinesReplcesments.replaceAll(kidText)
            if len(replaceText) > 0:
                # pull in leading <a> siblings of the text node
                prevSibNode = Parser.previousSibling(kidTextNode)
                while prevSibNode is not None \
                        and Parser.getTag(prevSibNode) == "a" \
                        and Parser.getAttribute(prevSibNode, 'grv-usedalready') != 'yes':
                    outer = " " + Parser.outerHtml(prevSibNode) + " "
                    replacementText.append(outer)
                    nodesToRemove.append(prevSibNode)
                    Parser.setAttribute(prevSibNode,
                                        attr='grv-usedalready', value='yes')
                    prevSibNode = Parser.previousSibling(prevSibNode)
                # append replaceText
                replacementText.append(replaceText)
                # pull in trailing <a> siblings as well
                nextSibNode = Parser.nextSibling(kidTextNode)
                while nextSibNode is not None \
                        and Parser.getTag(nextSibNode) == "a" \
                        and Parser.getAttribute(nextSibNode, 'grv-usedalready') != 'yes':
                    outer = " " + Parser.outerHtml(nextSibNode) + " "
                    replacementText.append(outer)
                    nodesToRemove.append(nextSibNode)
                    Parser.setAttribute(nextSibNode,
                                        attr='grv-usedalready', value='yes')
                    nextSibNode = Parser.nextSibling(nextSibNode)
        # otherwise
        else:
            # skip anchors already folded into the replacement buffer
            if Parser.getTag(kid) == "a" and Parser.getAttribute(kid, 'grv-usedalready') == 'yes':
                continue
            if len(replacementText) > 0:
                newNode = self.getFlushedBuffer(''.join(replacementText), doc)
                nodesToReturn.append(newNode)
                replacementText = []
            nodesToReturn.append(kid)
    # flush out anything still remaining
    if len(replacementText) > 0:
        newNode = self.getFlushedBuffer(''.join(replacementText), doc)
        nodesToReturn.append(newNode)
        replacementText = []
    for n in nodesToRemove:
        Parser.remove(n)
    return nodesToReturn
def getReplacementNodes(self, doc, div):
    replacementText = []
    nodesToReturn = []
    nodesToRemove = []
    childs = Parser.childNodesWithText(div)
    for kid in childs:
        # node is a p
        # and already have some replacement text
        if Parser.getTag(kid) == 'p' and len(replacementText) > 0:
            newNode = self.getFlushedBuffer(''.join(replacementText), doc)
            nodesToReturn.append(newNode)
            replacementText = []
            nodesToReturn.append(kid)
        # node is a text node
        elif Parser.isTextNode(kid):
            kidTextNode = kid
            kidText = Parser.getText(kid)
            replaceText = self.tabsAndNewLinesReplcesments.replaceAll(kidText)
            if len(replaceText) > 1:
                # pull in leading <a> siblings of the text node
                prevSibNode = Parser.previousSibling(kidTextNode)
                while prevSibNode is not None \
                        and Parser.getTag(prevSibNode) == "a" \
                        and Parser.getAttribute(prevSibNode, 'grv-usedalready') != 'yes':
                    outer = " " + Parser.outerHtml(prevSibNode) + " "
                    replacementText.append(outer)
                    nodesToRemove.append(prevSibNode)
                    Parser.setAttribute(prevSibNode,
                                        attr='grv-usedalready', value='yes')
                    prevSibNode = Parser.previousSibling(prevSibNode)
                # append replaceText
                replacementText.append(replaceText)
                # pull in trailing <a> siblings as well
                nextSibNode = Parser.nextSibling(kidTextNode)
                while nextSibNode is not None \
                        and Parser.getTag(nextSibNode) == "a" \
                        and Parser.getAttribute(nextSibNode, 'grv-usedalready') != 'yes':
                    outer = " " + Parser.outerHtml(nextSibNode) + " "
                    replacementText.append(outer)
                    nodesToRemove.append(nextSibNode)
                    Parser.setAttribute(nextSibNode,
                                        attr='grv-usedalready', value='yes')
                    nextSibNode = Parser.nextSibling(nextSibNode)
        # otherwise
        else:
            nodesToReturn.append(kid)
    # flush out anything still remaining
    if len(replacementText) > 0:
        newNode = self.getFlushedBuffer(''.join(replacementText), doc)
        nodesToReturn.append(newNode)
        replacementText = []
    # drop the anchors we folded into the buffer
    for n in nodesToRemove:
        Parser.remove(n)
    return nodesToReturn
def getReplacementNodes(self, doc, div):
    replacementText = []
    nodesToReturn = []
    nodesToRemove = []
    childs = Parser.childNodesWithText(div)
    for kid in childs:
        # node is a p
        # and already have some replacement text
        if Parser.getTag(kid) == "p" and len(replacementText) > 0:
            newNode = self.getFlushedBuffer("".join(replacementText), doc)
            nodesToReturn.append(newNode)
            replacementText = []
            nodesToReturn.append(kid)
        # node is a text node
        elif Parser.isTextNode(kid):
            kidTextNode = kid
            kidText = Parser.getText(kid)
            replaceText = self.tabsAndNewLinesReplcesments.replaceAll(kidText)
            if len(replaceText) > 1:
                # pull in leading <a> siblings of the text node
                prevSibNode = Parser.previousSibling(kidTextNode)
                while (
                    prevSibNode is not None
                    and Parser.getTag(prevSibNode) == "a"
                    and Parser.getAttribute(prevSibNode, "grv-usedalready") != "yes"
                ):
                    outer = " " + Parser.outerHtml(prevSibNode) + " "
                    replacementText.append(outer)
                    nodesToRemove.append(prevSibNode)
                    Parser.setAttribute(prevSibNode, attr="grv-usedalready", value="yes")
                    prevSibNode = Parser.previousSibling(prevSibNode)
                # append replaceText
                replacementText.append(replaceText)
                # pull in trailing <a> siblings as well
                nextSibNode = Parser.nextSibling(kidTextNode)
                while (
                    nextSibNode is not None
                    and Parser.getTag(nextSibNode) == "a"
                    and Parser.getAttribute(nextSibNode, "grv-usedalready") != "yes"
                ):
                    outer = " " + Parser.outerHtml(nextSibNode) + " "
                    replacementText.append(outer)
                    nodesToRemove.append(nextSibNode)
                    Parser.setAttribute(nextSibNode, attr="grv-usedalready", value="yes")
                    nextSibNode = Parser.nextSibling(nextSibNode)
        # otherwise
        else:
            nodesToReturn.append(kid)
    # flush out anything still remaining
    if len(replacementText) > 0:
        newNode = self.getFlushedBuffer("".join(replacementText), doc)
        nodesToReturn.append(newNode)
        replacementText = []
    for n in nodesToRemove:
        Parser.remove(n)
    return nodesToReturn