Python AnnotationTag.surrounds_attribute Exemples, scrapely.extraction.pageobjects.AnnotationTag.surrounds_attribute Python Exemples

Exemple #1

0

Afficher le fichier

    def _handle_unpaired_tag(self, html_tag):
        """Process an unpaired tag found in the template.

        Records ignored regions requested through the "ignore" (only for
        ``img`` tags) and "ignore-beneath" template attributes, then, if the
        tag carries a template annotation, builds an ``AnnotationTag``
        spanning exactly this tag and registers it in ``self.annotations``.
        The tag index is advanced by one in every case.

        Raises:
            AssertionError: if the annotation is flagged as "generated".
        """
        if self._read_bool_template_attribute(html_tag, "ignore") and html_tag.tag == "img":
            # An ignored image covers just this one tag index.
            self.ignored_regions.append((self.next_tag_index, self.next_tag_index + 1))
        elif self._read_bool_template_attribute(html_tag, "ignore-beneath"):
            # Open-ended region: the end index is left as None.
            self.ignored_regions.append((self.next_tag_index, None))
        jannotation = self._read_template_annotation(html_tag)
        if jannotation:
            # A new annotated unpaired tag ends any unpaired tag still pending.
            if self.unpairedtag_stack:
                self._close_unpaired_tag()

            annotation = AnnotationTag(self.next_tag_index, self.next_tag_index + 1)
            attribute_annotations = jannotation.pop('annotations', {}).items()
            content_key = jannotation.pop('text-content', 'content')
            for extract_attribute, tag_value in attribute_annotations:
                if extract_attribute == content_key:
                    # The content key maps to the field extracted from the
                    # text around the tag rather than from a tag attribute.
                    annotation.surrounds_attribute = tag_value
                    self.unpairedtag_stack.append(annotation)
                else:
                    annotation.tag_attributes.append((extract_attribute, tag_value))
            self.annotations.append(annotation)

            self.extra_required_attrs.extend(jannotation.pop('required', []))
            variant_id = jannotation.pop('variant', 0)
            if variant_id > 0:
                annotation.variant_id = variant_id

            # Pop outside of any `assert`: assert statements are removed under
            # `python -O`, which would leave the "generated" key inside the
            # metadata and skip the check entirely.
            generated = jannotation.pop("generated", False)
            if generated:
                raise AssertionError(
                    "generated annotations are not supported on unpaired tags")
            # Whatever keys remain after the pops above are kept as metadata.
            annotation.metadata = jannotation

        self.next_tag_index += 1

Exemple #2

0

Afficher le fichier

Fichier : pageparsing.py Projet : cgc1983/scrapely

    def _handle_unpaired_tag(self, html_tag):
        """Process an unpaired tag found in the template.

        Records ignored regions requested through the "ignore" (only for
        ``img`` tags) and "ignore-beneath" template attributes, then, if the
        tag carries a template annotation, builds an ``AnnotationTag``
        spanning exactly this tag and registers it in ``self.annotations``.
        The tag index is advanced by one in every case.

        Raises:
            AssertionError: if the annotation is flagged as "generated".
        """
        if self._read_bool_template_attribute(html_tag, "ignore") and html_tag.tag == "img":
            # An ignored image covers just this one tag index.
            self.ignored_regions.append((self.next_tag_index, self.next_tag_index + 1))
        elif self._read_bool_template_attribute(html_tag, "ignore-beneath"):
            # Open-ended region: the end index is left as None.
            self.ignored_regions.append((self.next_tag_index, None))
        jannotation = self._read_template_annotation(html_tag)
        if jannotation:
            # A new annotated unpaired tag ends any unpaired tag still pending.
            if self.unpairedtag_stack:
                self._close_unpaired_tag()

            annotation = AnnotationTag(self.next_tag_index, self.next_tag_index + 1)
            attribute_annotations = jannotation.pop('annotations', {}).items()
            content_key = jannotation.pop('text-content', 'content')
            for extract_attribute, tag_value in attribute_annotations:
                if extract_attribute == content_key:
                    # The content key maps to the field extracted from the
                    # text around the tag rather than from a tag attribute.
                    annotation.surrounds_attribute = tag_value
                    self.unpairedtag_stack.append(annotation)
                else:
                    annotation.tag_attributes.append((extract_attribute, tag_value))
            self.annotations.append(annotation)

            self.extra_required_attrs.extend(jannotation.pop('required', []))
            variant_id = jannotation.pop('variant', 0)
            if variant_id > 0:
                annotation.variant_id = variant_id

            # Pop outside of any `assert`: assert statements are removed under
            # `python -O`, which would leave the "generated" key inside the
            # metadata and skip the check entirely.
            generated = jannotation.pop("generated", False)
            if generated:
                raise AssertionError(
                    "generated annotations are not supported on unpaired tags")
            # Whatever keys remain after the pops above are kept as metadata.
            annotation.metadata = jannotation

        self.next_tag_index += 1

Exemple #3

0

Afficher le fichier

Fichier : pageparsing.py Projet : daqv/portia-dashboard

 def build_annotation(self, jannotation, is_open=True):
     """Create an AnnotationTag for the tag at the current tag index.

     The 'text-content' and 'annotations' entries are removed from
     ``jannotation``; the entry of 'annotations' stored under the content
     key (default 'content') becomes ``surrounds_attribute`` when
     ``is_open`` is true and the value is truthy, and the remaining
     entries become ``tag_attributes``. What is left of ``jannotation``
     is attached as the annotation's metadata.
     """
     tag_index = self.next_tag_index
     built = AnnotationTag(tag_index, tag_index + 1)

     text_key = jannotation.pop('text-content', 'content')
     per_attribute = jannotation.pop('annotations', {})
     # The content entry is always taken out of the mapping, even when it
     # ends up unused, so it never shows up among the tag attributes.
     surrounded = per_attribute.pop(text_key, None)
     if is_open and surrounded:
         built.surrounds_attribute = surrounded

     built.tag_attributes = list(per_attribute.items())
     built.metadata = jannotation
     return built

Exemple #4

0

Afficher le fichier

 def build_annotation(self, jannotation, is_open=True):
     """Create an AnnotationTag for the tag at the current tag index.

     The 'text-content' and 'annotations' entries are removed from
     ``jannotation``; the entry of 'annotations' stored under the content
     key (default 'content') becomes ``surrounds_attribute`` when
     ``is_open`` is true and the value is truthy, and the remaining
     entries become ``tag_attributes``. What is left of ``jannotation``
     is attached as the annotation's metadata.
     """
     tag_index = self.next_tag_index
     built = AnnotationTag(tag_index, tag_index + 1)

     text_key = jannotation.pop('text-content', 'content')
     per_attribute = jannotation.pop('annotations', {})
     # The content entry is always taken out of the mapping, even when it
     # ends up unused, so it never shows up among the tag attributes.
     surrounded = per_attribute.pop(text_key, None)
     if is_open and surrounded:
         built.surrounds_attribute = surrounded

     built.tag_attributes = list(per_attribute.items())
     built.metadata = jannotation
     return built

Exemple #5

0

Afficher le fichier

    def _handle_open_tag(self, html_tag):
        """Process an opening tag found in the template.

        Updates the ignored-region, replacement and unpaired-tag bookkeeping
        for ``html_tag`` and, when the tag carries a template annotation,
        builds an ``AnnotationTag`` for it. An annotation with a
        ``surrounds_attribute`` is pushed on ``labelled_tag_stacks`` under
        the tag name; any other annotation is given a one-tag span and
        appended to ``self.annotations`` right away.
        """
        if self._read_bool_template_attribute(html_tag, "ignore"):
            if html_tag.tag == "img":
                # An ignored image covers just this one tag index.
                self.ignored_regions.append((self.next_tag_index, self.next_tag_index + 1))
            else:
                # Open-ended region (end is None); the tag is remembered on
                # the per-tag-name stack.
                self.ignored_regions.append((self.next_tag_index, None))
                self.ignored_tag_stacks[html_tag.tag].append(html_tag)

        elif self.ignored_tag_stacks.get(html_tag.tag):
            # A same-named ignored tag is already pending: push a None
            # placeholder for this nested, non-ignored tag.
            self.ignored_tag_stacks[html_tag.tag].append(None)
        if self._read_bool_template_attribute(html_tag, "ignore-beneath"):
            self.ignored_regions.append((self.next_tag_index, None))

        # NOTE: pop() removes the replacement attribute from the tag itself.
        replacement = html_tag.attributes.pop("data-scrapy-replacement", None)
        if replacement:
            # Swap the most recently added token for one built from the
            # replacement name.
            self.token_list.pop()
            self._add_token(replacement, html_tag.tag_type, html_tag.start, html_tag.end)
            self.replacement_stacks[html_tag.tag].append(replacement)
        elif html_tag.tag in self.replacement_stacks:
            self.replacement_stacks[html_tag.tag].append(None)

        if self.unpairedtag_stack:
            if html_tag.tag in _END_UNPAIREDTAG_TAGS:
                self._close_unpaired_tag()
            else:
                # Track tags opened while an unpaired-tag annotation is pending.
                self.unpairedtag_stack.append(html_tag.tag)

        tagname = replacement or self._update_replacement_stack(html_tag)
        self._handle_unclosed_tags(tagname, _AUTO_CLOSE_TAGS_ON_OPEN)

        jannotation = self._read_template_annotation(html_tag)
        if not jannotation:
            if tagname in self.labelled_tag_stacks:
                # add this tag to the stack to match correct end tag
                self.labelled_tag_stacks[tagname].append(None)
            self.next_tag_index += 1
            return

        # The end index is unknown at this point, hence None.
        annotation = AnnotationTag(self.next_tag_index, None)
        if jannotation.pop('generated', False):
            # A generated annotation drops the token just added for this tag
            # and starts one index earlier.
            self.token_list.pop()
            annotation.start_index -= 1
            if self.previous_element_class == HtmlTag:
                annotation.annotation_text = AnnotationText('')
            else:
                annotation.annotation_text = AnnotationText(self.prev_data)
            if self._read_bool_template_attribute(html_tag, "ignore") \
                    or self._read_bool_template_attribute(html_tag, "ignore-beneath"):
                # Shift the start of the last recorded ignored region back by
                # one as well.
                # NOTE(review): only the last region is adjusted, even if both
                # "ignore" and "ignore-beneath" added one above - confirm.
                ignored = self.ignored_regions.pop()
                self.ignored_regions.append((ignored[0]-1, ignored[1]))

        self.extra_required_attrs.extend(jannotation.pop('required', []))

        attribute_annotations = jannotation.pop('annotations', {}).items()
        content_key = jannotation.pop('text-content', 'content')
        for extract_attribute, tag_value in attribute_annotations:
            if extract_attribute == content_key:
                annotation.surrounds_attribute = tag_value
            else:
                annotation.tag_attributes.append((extract_attribute, tag_value))

        variant_id = jannotation.pop('variant', 0)
        if variant_id > 0:
            if annotation.surrounds_attribute is not None:
                # Pushed on the stack rather than set directly; the block
                # further down then assigns the top of the stack.
                self.variant_stack.append(variant_id)
            else:
                annotation.variant_id = variant_id

        # Whatever keys remain after the pops above are kept as metadata.
        annotation.metadata = jannotation

        # Only annotations without annotation_text advance the tag index
        # (presumably AnnotationTag initialises annotation_text to None, so
        # this skips exactly the generated case - confirm).
        if annotation.annotation_text is None:
            self.next_tag_index += 1
        if self.variant_stack and annotation.variant_id is None:
            variant_id = self.variant_stack[-1]
            # NOTE(review): compared against the string '0' although this
            # method only pushes values > 0 - verify what else pushes here.
            if variant_id == '0':
                variant_id = None
            annotation.variant_id = variant_id

        # look for a closing tag if the content is important
        if annotation.surrounds_attribute:
            self.labelled_tag_stacks[tagname].append(annotation)
        else:
            annotation.end_index = annotation.start_index + 1
            self.annotations.append(annotation)

Exemple #6

0

Afficher le fichier

Fichier : pageparsing.py Projet : cgc1983/scrapely

    def _handle_open_tag(self, html_tag):
        """Process an opening tag found in the template.

        Updates the ignored-region, replacement and unpaired-tag bookkeeping
        for ``html_tag`` and, when the tag carries a template annotation,
        builds an ``AnnotationTag`` for it. An annotation with a
        ``surrounds_attribute`` is pushed on ``labelled_tag_stacks`` under
        the tag name; any other annotation is given a one-tag span and
        appended to ``self.annotations`` right away.
        """
        if self._read_bool_template_attribute(html_tag, "ignore"):
            if html_tag.tag == "img":
                # An ignored image covers just this one tag index.
                self.ignored_regions.append((self.next_tag_index, self.next_tag_index + 1))
            else:
                # Open-ended region (end is None); the tag is remembered on
                # the per-tag-name stack.
                self.ignored_regions.append((self.next_tag_index, None))
                self.ignored_tag_stacks[html_tag.tag].append(html_tag)

        elif self.ignored_tag_stacks.get(html_tag.tag):
            # A same-named ignored tag is already pending: push a None
            # placeholder for this nested, non-ignored tag.
            self.ignored_tag_stacks[html_tag.tag].append(None)
        if self._read_bool_template_attribute(html_tag, "ignore-beneath"):
            self.ignored_regions.append((self.next_tag_index, None))

        # NOTE: pop() removes the replacement attribute from the tag itself.
        replacement = html_tag.attributes.pop("data-scrapy-replacement", None)
        if replacement:
            # Swap the most recently added token for one built from the
            # replacement name.
            self.token_list.pop()
            self._add_token(replacement, html_tag.tag_type, html_tag.start, html_tag.end)
            self.replacement_stacks[html_tag.tag].append(replacement)
        elif html_tag.tag in self.replacement_stacks:
            self.replacement_stacks[html_tag.tag].append(None)

        if self.unpairedtag_stack:
            if html_tag.tag in _END_UNPAIREDTAG_TAGS:
                self._close_unpaired_tag()
            else:
                # Track tags opened while an unpaired-tag annotation is pending.
                self.unpairedtag_stack.append(html_tag.tag)

        tagname = replacement or self._update_replacement_stack(html_tag)
        self._handle_unclosed_tags(tagname, _AUTO_CLOSE_TAGS_ON_OPEN)

        jannotation = self._read_template_annotation(html_tag)
        if not jannotation:
            if tagname in self.labelled_tag_stacks:
                # add this tag to the stack to match correct end tag
                self.labelled_tag_stacks[tagname].append(None)
            self.next_tag_index += 1
            return

        # The end index is unknown at this point, hence None.
        annotation = AnnotationTag(self.next_tag_index, None)
        if jannotation.pop('generated', False):
            # A generated annotation drops the token just added for this tag
            # and starts one index earlier.
            self.token_list.pop()
            annotation.start_index -= 1
            if self.previous_element_class == HtmlTag:
                annotation.annotation_text = AnnotationText('')
            else:
                annotation.annotation_text = AnnotationText(self.prev_data)
            if self._read_bool_template_attribute(html_tag, "ignore") \
                    or self._read_bool_template_attribute(html_tag, "ignore-beneath"):
                # Shift the start of the last recorded ignored region back by
                # one as well.
                # NOTE(review): only the last region is adjusted, even if both
                # "ignore" and "ignore-beneath" added one above - confirm.
                ignored = self.ignored_regions.pop()
                self.ignored_regions.append((ignored[0]-1, ignored[1]))

        self.extra_required_attrs.extend(jannotation.pop('required', []))

        attribute_annotations = jannotation.pop('annotations', {}).items()
        content_key = jannotation.pop('text-content', 'content')
        for extract_attribute, tag_value in attribute_annotations:
            if extract_attribute == content_key:
                annotation.surrounds_attribute = tag_value
            else:
                annotation.tag_attributes.append((extract_attribute, tag_value))

        variant_id = jannotation.pop('variant', 0)
        if variant_id > 0:
            if annotation.surrounds_attribute is not None:
                # Pushed on the stack rather than set directly; the block
                # further down then assigns the top of the stack.
                self.variant_stack.append(variant_id)
            else:
                annotation.variant_id = variant_id

        # Whatever keys remain after the pops above are kept as metadata.
        annotation.metadata = jannotation

        # Only annotations without annotation_text advance the tag index
        # (presumably AnnotationTag initialises annotation_text to None, so
        # this skips exactly the generated case - confirm).
        if annotation.annotation_text is None:
            self.next_tag_index += 1
        if self.variant_stack and annotation.variant_id is None:
            variant_id = self.variant_stack[-1]
            # NOTE(review): compared against the string '0' although this
            # method only pushes values > 0 - verify what else pushes here.
            if variant_id == '0':
                variant_id = None
            annotation.variant_id = variant_id

        # look for a closing tag if the content is important
        if annotation.surrounds_attribute:
            self.labelled_tag_stacks[tagname].append(annotation)
        else:
            annotation.end_index = annotation.start_index + 1
            self.annotations.append(annotation)