Exemple #1
0
    def _handle_unpaired_tag(self, html_tag):
        """Process a tag that has no separate closing tag.

        An ``img`` tag flagged with the "ignore" template attribute adds a
        one-tag ignored region; an ``img`` tag flagged with "ignore-beneath"
        adds an open-ended one (end index ``None``). If the tag carries a
        template annotation, an ``AnnotationTag`` spanning exactly this tag
        is recorded in ``self.annotations``. ``self.next_tag_index`` is
        advanced by one in every case.
        """
        # The flag helper is always called before the tag name is tested,
        # so it runs the same number of times whatever the tag is.
        ignored_region = None
        if self._read_bool_template_attribute(html_tag, "ignore") \
                and html_tag.tag == "img":
            ignored_region = (self.next_tag_index, self.next_tag_index + 1)
        elif self._read_bool_template_attribute(html_tag, "ignore-beneath") \
                and html_tag.tag == "img":
            ignored_region = (self.next_tag_index, None)
        if ignored_region is not None:
            self.ignored_regions.append(ignored_region)

        template_annotation = self._read_template_annotation(html_tag)
        if not template_annotation:
            # Nothing to annotate: only account for the tag itself.
            self.next_tag_index += 1
            return

        # A new annotated unpaired tag ends any unpaired tag still open.
        if self.unpairedtag_stack:
            self._close_unpaired_tag()

        start = self.next_tag_index
        tag_annotation = AnnotationTag(start, start + 1)
        extracted = template_annotation.pop('annotations', {})
        for attribute, field in extracted.items():
            if attribute != 'content':
                tag_annotation.tag_attributes.append((attribute, field))
                continue
            # 'content' means the annotation surrounds what follows the tag,
            # so it stays on the unpaired stack until something closes it.
            tag_annotation.surrounds_attribute = field
            self.unpairedtag_stack.append(tag_annotation)
        self.annotations.append(tag_annotation)

        if template_annotation.pop('common_prefix', False):
            tag_annotation.match_common_prefix = True
        self.extra_required_attrs.extend(
            template_annotation.pop('required', []))
        # Whatever keys were not consumed above are kept as metadata.
        tag_annotation.metadata = template_annotation

        self.next_tag_index += 1
Exemple #2
0
    def _handle_unpaired_tag(self, html_tag):
        """Process a tag that has no separate closing tag.

        Adds an ignored region when the tag is an ``img`` flagged "ignore"
        (a one-tag region) or any tag flagged "ignore-beneath" (an
        open-ended region whose end index is ``None``). When the tag carries
        a template annotation, an ``AnnotationTag`` covering just this tag
        is appended to ``self.annotations``. ``self.next_tag_index`` always
        moves forward by one.
        """
        # "ignore" is read first and unconditionally; "ignore-beneath" is
        # only read when the first test did not match.
        if self._read_bool_template_attribute(html_tag, "ignore") \
                and html_tag.tag == "img":
            single_tag_region = (self.next_tag_index, self.next_tag_index + 1)
            self.ignored_regions.append(single_tag_region)
        elif self._read_bool_template_attribute(html_tag, "ignore-beneath"):
            open_ended_region = (self.next_tag_index, None)
            self.ignored_regions.append(open_ended_region)

        template_annotation = self._read_template_annotation(html_tag)
        if not template_annotation:
            # Unannotated tag: just count it.
            self.next_tag_index += 1
            return

        # An annotated unpaired tag terminates any unpaired tag still open.
        if self.unpairedtag_stack:
            self._close_unpaired_tag()

        index = self.next_tag_index
        tag_annotation = AnnotationTag(index, index + 1)
        attribute_map = template_annotation.pop('annotations', {})
        for attribute, field in attribute_map.items():
            if attribute != 'content':
                tag_annotation.tag_attributes.append((attribute, field))
                continue
            # A 'content' annotation surrounds what follows the tag; keep it
            # on the unpaired stack until it is closed.
            tag_annotation.surrounds_attribute = field
            self.unpairedtag_stack.append(tag_annotation)
        self.annotations.append(tag_annotation)

        required = template_annotation.pop('required', [])
        self.extra_required_attrs.extend(required)
        # The keys left after the pops above become the metadata.
        tag_annotation.metadata = template_annotation

        self.next_tag_index += 1
 def __init__(self, extractors, template_tokens):
     """Create a record extractor that spans a group of region extractors.

     The resulting ``self.annotation`` starts at the smallest
     ``annotation.start_index`` and ends at the largest
     ``annotation.end_index`` found among ``extractors``.
     """
     self.extractors, self.template_tokens = extractors, template_tokens
     self.template_ignored_regions = []
     # Two separate passes over ``extractors``, one per bound.
     starts = [ex.annotation.start_index for ex in extractors]
     ends = [ex.annotation.end_index for ex in extractors]
     self.annotation = AnnotationTag(min(starts), max(ends))
Exemple #4
0
    def _handle_open_tag(self, html_tag):
        """Process an opening tag of the template.

        In order, this updates the ignored regions, applies a token
        replacement if the tag requests one, maintains the unpaired-tag and
        labelled-tag stacks, and, when the tag carries a template annotation,
        builds an ``AnnotationTag`` for it. The annotation is either appended
        to ``self.annotations`` straight away or pushed on
        ``self.labelled_tag_stacks`` until its end is known.
        """
        # "ignore": an img gets a one-tag region; any other tag opens a
        # region with no end yet (None) and is pushed on the per-tag-name
        # ignored stack.
        if self._read_bool_template_attribute(html_tag, "ignore"):
            if html_tag.tag == "img":
                self.ignored_regions.append((self.next_tag_index, self.next_tag_index + 1))
            else:
                self.ignored_regions.append((self.next_tag_index, None))
                self.ignored_tag_stacks[html_tag.tag].append(html_tag)
                
        elif self.ignored_tag_stacks.get(html_tag.tag):
            # A tag with this name is already being ignored: push a None
            # placeholder (presumably so nested closing tags pair up
            # correctly -- confirm against the close-tag handler).
            self.ignored_tag_stacks[html_tag.tag].append(None)
        # "ignore-beneath": open-ended ignored region starting at this tag.
        if self._read_bool_template_attribute(html_tag, "ignore-beneath"):
            self.ignored_regions.append((self.next_tag_index, None))
        
        # The replacement attribute is removed from the tag. When it is
        # set, the most recently added token is dropped and a token built
        # from the replacement value is added in its place.
        replacement = html_tag.attributes.pop("data-scrapy-replacement", None)
        if replacement:
            self.token_list.pop()
            self._add_token(replacement, html_tag.tag_type, html_tag.start, html_tag.end)
            self.replacement_stacks[html_tag.tag].append(replacement)
        elif html_tag.tag in self.replacement_stacks:
            # Same tag name nested under a replaced tag: None placeholder.
            self.replacement_stacks[html_tag.tag].append(None)

        # While an unpaired annotated tag is open, certain tags close it;
        # any other tag name is pushed on the unpaired stack.
        if self.unpairedtag_stack:
            if html_tag.tag in _END_UNPAIREDTAG_TAGS:
                self._close_unpaired_tag()
            else:
                self.unpairedtag_stack.append(html_tag.tag)
            
        # can't be a p inside another p. Also, an open p element closes
        # a previous open p element.
        if html_tag.tag == "p" and html_tag.tag in self.labelled_tag_stacks:
            # Finish the first annotation on the removed "p" stack at the
            # current tag index.
            annotation = self.labelled_tag_stacks.pop(html_tag.tag)[0]
            annotation.end_index = self.next_tag_index
            self.annotations.append(annotation)
                
        jannotation = self._read_template_annotation(html_tag)
        if not jannotation:
            if html_tag.tag in self.labelled_tag_stacks:
                # add this tag to the stack to match correct end tag
                self.labelled_tag_stacks[html_tag.tag].append(None)
            self.next_tag_index += 1
            return
        
        # Annotated tag: the end index is filled in later.
        annotation = AnnotationTag(self.next_tag_index, None)
        if jannotation.pop('generated', False):
            # "generated" annotation: the last token is dropped and the
            # annotation starts one index earlier. Its annotation_text is
            # empty when the previous element was an HtmlTag, otherwise it
            # wraps self.prev_data.
            self.token_list.pop()
            annotation.start_index -= 1
            if self.previous_element_class == HtmlTag:
                annotation.annotation_text = AnnotationText('')
            else:
                annotation.annotation_text = AnnotationText(self.prev_data)
            # Move the start of the most recent ignored region back by one
            # to follow the shifted start index.
            # NOTE(review): both flags were already read at the top of this
            # method; if the helper consumes the attribute, this second read
            # may not see it -- confirm.
            if self._read_bool_template_attribute(html_tag, "ignore") \
                    or self._read_bool_template_attribute(html_tag, "ignore-beneath"):
                ignored = self.ignored_regions.pop()
                self.ignored_regions.append((ignored[0]-1, ignored[1]))
                
        self.extra_required_attrs.extend(jannotation.pop('required', []))
        
        # 'content' marks an annotation that surrounds the element content;
        # every other key is recorded as an (attribute, value) pair.
        attribute_annotations = jannotation.pop('annotations', {}).items()
        for extract_attribute, tag_value in attribute_annotations:
            if extract_attribute == 'content':
                annotation.surrounds_attribute = tag_value
            else:
                annotation.tag_attributes.append((extract_attribute, tag_value))
 
        # A positive variant on a content annotation is pushed on the
        # variant stack; otherwise it is set on this annotation directly.
        variant_id = jannotation.pop('variant', 0)
        if variant_id > 0:
            if annotation.surrounds_attribute is not None:
                self.variant_stack.append(variant_id)
            else:
                annotation.variant_id = variant_id
       
        # The keys not consumed above are kept as the annotation metadata.
        annotation.metadata = jannotation

        # annotation_text is only assigned in the "generated" branch above;
        # while it is None the tag index is advanced.
        if annotation.annotation_text is None:
            self.next_tag_index += 1
        # With no variant of its own, the annotation takes the variant on
        # top of the variant stack.
        if self.variant_stack and annotation.variant_id is None:
            variant_id = self.variant_stack[-1]
            # NOTE(review): compared with the string '0' here but with the
            # integer 0 above -- confirm the expected type of 'variant'.
            if variant_id == '0':
                variant_id = None
            annotation.variant_id = variant_id
        
        # look for a closing tag if the content is important
        if annotation.surrounds_attribute:
            self.labelled_tag_stacks[html_tag.tag].append(annotation)
        else:
            # Otherwise the annotation covers a single index and is complete.
            annotation.end_index = annotation.start_index + 1
            self.annotations.append(annotation)
Exemple #5
0
    def _handle_open_tag(self, html_tag):
        """Process an opening tag of the template.

        In order, this updates the ignored regions, applies a token
        replacement if the tag requests one, maintains the unpaired-tag and
        labelled-tag stacks, and, when the tag carries a template annotation,
        builds an ``AnnotationTag`` for it. The annotation is either appended
        to ``self.annotations`` straight away or pushed on
        ``self.labelled_tag_stacks`` until its end is known.
        """
        # "ignore": an img gets a one-tag region; any other tag opens a
        # region with no end yet (None) and is pushed on the per-tag-name
        # ignored stack.
        if self._read_bool_template_attribute(html_tag, "ignore"):
            if html_tag.tag == "img":
                self.ignored_regions.append(
                    (self.next_tag_index, self.next_tag_index + 1))
            else:
                self.ignored_regions.append((self.next_tag_index, None))
                self.ignored_tag_stacks[html_tag.tag].append(html_tag)

        elif self.ignored_tag_stacks.get(html_tag.tag):
            # A tag with this name is already being ignored: push a None
            # placeholder (presumably so nested closing tags pair up
            # correctly -- confirm against the close-tag handler).
            self.ignored_tag_stacks[html_tag.tag].append(None)
        # "ignore-beneath": open-ended ignored region starting at this tag.
        if self._read_bool_template_attribute(html_tag, "ignore-beneath"):
            self.ignored_regions.append((self.next_tag_index, None))

        # The replacement attribute is removed from the tag. When it is
        # set, the most recently added token is dropped and a token built
        # from the replacement value is added in its place.
        replacement = html_tag.attributes.pop("data-scrapy-replacement", None)
        if replacement:
            self.token_list.pop()
            self._add_token(replacement, html_tag.tag_type, html_tag.start,
                            html_tag.end)
            self.replacement_stacks[html_tag.tag].append(replacement)
        elif html_tag.tag in self.replacement_stacks:
            # Same tag name nested under a replaced tag: None placeholder.
            self.replacement_stacks[html_tag.tag].append(None)

        # While an unpaired annotated tag is open, certain tags close it;
        # any other tag name is pushed on the unpaired stack.
        if self.unpairedtag_stack:
            if html_tag.tag in _END_UNPAIREDTAG_TAGS:
                self._close_unpaired_tag()
            else:
                self.unpairedtag_stack.append(html_tag.tag)

        # can't be a p inside another p. Also, an open p element closes
        # a previous open p element.
        if html_tag.tag == "p" and html_tag.tag in self.labelled_tag_stacks:
            # Finish the first annotation on the removed "p" stack at the
            # current tag index.
            annotation = self.labelled_tag_stacks.pop(html_tag.tag)[0]
            annotation.end_index = self.next_tag_index
            self.annotations.append(annotation)

        jannotation = self._read_template_annotation(html_tag)
        if not jannotation:
            if html_tag.tag in self.labelled_tag_stacks:
                # add this tag to the stack to match correct end tag
                self.labelled_tag_stacks[html_tag.tag].append(None)
            self.next_tag_index += 1
            return

        # Annotated tag: the end index is filled in later.
        annotation = AnnotationTag(self.next_tag_index, None)
        if jannotation.pop('generated', False):
            # "generated" annotation: the last token is dropped and the
            # annotation starts one index earlier. Its annotation_text is
            # empty when the previous element was an HtmlTag, otherwise it
            # wraps self.prev_data.
            self.token_list.pop()
            annotation.start_index -= 1
            if self.previous_element_class == HtmlTag:
                annotation.annotation_text = AnnotationText('')
            else:
                annotation.annotation_text = AnnotationText(self.prev_data)
            # Move the start of the most recent ignored region back by one
            # to follow the shifted start index.
            # NOTE(review): both flags were already read at the top of this
            # method; if the helper consumes the attribute, this second read
            # may not see it -- confirm.
            if self._read_bool_template_attribute(html_tag, "ignore") \
                    or self._read_bool_template_attribute(html_tag, "ignore-beneath"):
                ignored = self.ignored_regions.pop()
                self.ignored_regions.append((ignored[0] - 1, ignored[1]))

        self.extra_required_attrs.extend(jannotation.pop('required', []))

        # 'content' marks an annotation that surrounds the element content;
        # every other key is recorded as an (attribute, value) pair.
        attribute_annotations = jannotation.pop('annotations', {}).items()
        for extract_attribute, tag_value in attribute_annotations:
            if extract_attribute == 'content':
                annotation.surrounds_attribute = tag_value
            else:
                annotation.tag_attributes.append(
                    (extract_attribute, tag_value))

        # A positive variant on a content annotation is pushed on the
        # variant stack; otherwise it is set on this annotation directly.
        variant_id = jannotation.pop('variant', 0)
        if variant_id > 0:
            if annotation.surrounds_attribute is not None:
                self.variant_stack.append(variant_id)
            else:
                annotation.variant_id = variant_id

        # The keys not consumed above are kept as the annotation metadata.
        annotation.metadata = jannotation

        # annotation_text is only assigned in the "generated" branch above;
        # while it is None the tag index is advanced.
        if annotation.annotation_text is None:
            self.next_tag_index += 1
        # With no variant of its own, the annotation takes the variant on
        # top of the variant stack.
        if self.variant_stack and annotation.variant_id is None:
            variant_id = self.variant_stack[-1]
            # NOTE(review): compared with the string '0' here but with the
            # integer 0 above -- confirm the expected type of 'variant'.
            if variant_id == '0':
                variant_id = None
            annotation.variant_id = variant_id

        # look for a closing tag if the content is important
        if annotation.surrounds_attribute:
            self.labelled_tag_stacks[html_tag.tag].append(annotation)
        else:
            # Otherwise the annotation covers a single index and is complete.
            annotation.end_index = annotation.start_index + 1
            self.annotations.append(annotation)