def _handle_unpaired_tag(self, html_tag):
    """Process an unpaired template tag and record any annotation on it.

    An ``ignore`` marker on an ``img`` tag registers a one-tag ignored
    region; otherwise an ``ignore-beneath`` marker registers an open-ended
    region starting at this tag.  When the tag carries a template
    annotation, an ``AnnotationTag`` covering exactly this tag is appended
    to ``self.annotations``.  ``self.next_tag_index`` is advanced by one in
    every case.

    Args:
        html_tag: the parsed tag.  Only ``html_tag.tag`` is read directly;
            everything else goes through the ``_read_*`` helpers.

    Raises:
        AssertionError: if the annotation is flagged as ``generated`` (any
            truthy value), which this handler does not support.
    """
    if (self._read_bool_template_attribute(html_tag, "ignore")
            and html_tag.tag == "img"):
        self.ignored_regions.append(
            (self.next_tag_index, self.next_tag_index + 1))
    elif self._read_bool_template_attribute(html_tag, "ignore-beneath"):
        # ``None`` leaves the region's end open.
        self.ignored_regions.append((self.next_tag_index, None))

    jannotation = self._read_template_annotation(html_tag)
    if jannotation:
        # A previously opened unpaired annotation ends where this one starts.
        if self.unpairedtag_stack:
            self._close_unpaired_tag()

        annotation = AnnotationTag(self.next_tag_index,
                                   self.next_tag_index + 1)
        attribute_annotations = jannotation.pop('annotations', {}).items()
        content_key = jannotation.pop('text-content', 'content')
        for extract_attribute, tag_value in attribute_annotations:
            if extract_attribute == content_key:
                annotation.surrounds_attribute = tag_value
                # Remember the annotation so a later _close_unpaired_tag()
                # call can finish it (presumably by setting its end index;
                # confirm in that helper).
                self.unpairedtag_stack.append(annotation)
            else:
                annotation.tag_attributes.append(
                    (extract_attribute, tag_value))
        self.annotations.append(annotation)

        self.extra_required_attrs.extend(jannotation.pop('required', []))
        variant_id = jannotation.pop('variant', 0)
        if variant_id > 0:
            annotation.variant_id = variant_id

        # Pop outside of any ``assert``: assert statements are removed under
        # ``python -O``, which would skip the pop and leak the "generated"
        # key into the metadata as well as disable the check.
        generated = jannotation.pop("generated", False)
        if generated:
            raise AssertionError(
                "generated annotations are not supported on unpaired tags")

        # Whatever keys are left over travel with the annotation as metadata.
        annotation.metadata = jannotation
    self.next_tag_index += 1
def _handle_unpaired_tag(self, html_tag):
    """Process an unpaired template tag and record any annotation on it.

    An ``ignore`` marker on an ``img`` tag registers a one-tag ignored
    region; otherwise an ``ignore-beneath`` marker registers an open-ended
    region starting at this tag.  When the tag carries a template
    annotation, an ``AnnotationTag`` covering exactly this tag is appended
    to ``self.annotations``.  ``self.next_tag_index`` is advanced by one in
    every case.

    Args:
        html_tag: the parsed tag.  Only ``html_tag.tag`` is read directly;
            everything else goes through the ``_read_*`` helpers.

    Raises:
        AssertionError: if the annotation is flagged as ``generated`` (any
            truthy value), which this handler does not support.
    """
    if (self._read_bool_template_attribute(html_tag, "ignore")
            and html_tag.tag == "img"):
        self.ignored_regions.append(
            (self.next_tag_index, self.next_tag_index + 1))
    elif self._read_bool_template_attribute(html_tag, "ignore-beneath"):
        # ``None`` leaves the region's end open.
        self.ignored_regions.append((self.next_tag_index, None))

    jannotation = self._read_template_annotation(html_tag)
    if jannotation:
        # A previously opened unpaired annotation ends where this one starts.
        if self.unpairedtag_stack:
            self._close_unpaired_tag()

        annotation = AnnotationTag(self.next_tag_index,
                                   self.next_tag_index + 1)
        attribute_annotations = jannotation.pop('annotations', {}).items()
        content_key = jannotation.pop('text-content', 'content')
        for extract_attribute, tag_value in attribute_annotations:
            if extract_attribute == content_key:
                annotation.surrounds_attribute = tag_value
                # Remember the annotation so a later _close_unpaired_tag()
                # call can finish it (presumably by setting its end index;
                # confirm in that helper).
                self.unpairedtag_stack.append(annotation)
            else:
                annotation.tag_attributes.append(
                    (extract_attribute, tag_value))
        self.annotations.append(annotation)

        self.extra_required_attrs.extend(jannotation.pop('required', []))
        variant_id = jannotation.pop('variant', 0)
        if variant_id > 0:
            annotation.variant_id = variant_id

        # Pop outside of any ``assert``: assert statements are removed under
        # ``python -O``, which would skip the pop and leak the "generated"
        # key into the metadata as well as disable the check.
        generated = jannotation.pop("generated", False)
        if generated:
            raise AssertionError(
                "generated annotations are not supported on unpaired tags")

        # Whatever keys are left over travel with the annotation as metadata.
        annotation.metadata = jannotation
    self.next_tag_index += 1
def build_annotation(self, jannotation, is_open=True):
    """Create an ``AnnotationTag`` for the current tag from *jannotation*.

    The tag spans ``self.next_tag_index`` to ``self.next_tag_index + 1``.

    Args:
        jannotation: annotation mapping; it is modified in place.  The
            ``'text-content'`` and ``'annotations'`` entries are removed and
            the remaining mapping is stored as the tag's metadata.
        is_open: when true, a truthy annotation for the content key is
            stored as ``surrounds_attribute``.

    Returns:
        The newly built ``AnnotationTag``.
    """
    tag = AnnotationTag(self.next_tag_index, self.next_tag_index + 1)

    text_key = jannotation.pop('text-content', 'content')
    per_attribute = jannotation.pop('annotations', {})

    # The content entry is always taken out of the per-attribute mapping,
    # whether or not it ends up being used, so it never shows up among the
    # tag attributes.
    surrounded = per_attribute.pop(text_key, None)
    if is_open and surrounded:
        tag.surrounds_attribute = surrounded

    tag.tag_attributes = list(per_attribute.items())
    # What is left of the mapping is kept as free-form metadata.
    tag.metadata = jannotation
    return tag
def build_annotation(self, jannotation, is_open=True):
    """Create an ``AnnotationTag`` for the current tag from *jannotation*.

    The tag spans ``self.next_tag_index`` to ``self.next_tag_index + 1``.

    Args:
        jannotation: annotation mapping; it is modified in place.  The
            ``'text-content'`` and ``'annotations'`` entries are removed and
            the remaining mapping is stored as the tag's metadata.
        is_open: when true, a truthy annotation for the content key is
            stored as ``surrounds_attribute``.

    Returns:
        The newly built ``AnnotationTag``.
    """
    tag = AnnotationTag(self.next_tag_index, self.next_tag_index + 1)

    text_key = jannotation.pop('text-content', 'content')
    per_attribute = jannotation.pop('annotations', {})

    # The content entry is always taken out of the per-attribute mapping,
    # whether or not it ends up being used, so it never shows up among the
    # tag attributes.
    surrounded = per_attribute.pop(text_key, None)
    if is_open and surrounded:
        tag.surrounds_attribute = surrounded

    tag.tag_attributes = list(per_attribute.items())
    # What is left of the mapping is kept as free-form metadata.
    tag.metadata = jannotation
    return tag
def _handle_open_tag(self, html_tag):
    """Process an opening template tag.

    In order, this records ignored regions requested by the tag, applies a
    ``data-scrapy-replacement`` tag name if present, updates the
    unpaired-tag stack, lets ``_handle_unclosed_tags`` run for the effective
    tag name, and finally turns a template annotation on the tag (if any)
    into an ``AnnotationTag``.

    An annotation with a content annotation (``surrounds_attribute``) is
    pushed on ``self.labelled_tag_stacks[tagname]`` and left without an end
    index; any other annotation is closed immediately and appended to
    ``self.annotations``.

    Args:
        html_tag: the parsed opening tag.  Its ``attributes`` mapping is
            modified: ``data-scrapy-replacement`` is removed from it.
    """
    # --- ignored regions -------------------------------------------------
    if self._read_bool_template_attribute(html_tag, "ignore"):
        if html_tag.tag == "img":
            # An ignored img covers exactly one tag.
            self.ignored_regions.append((self.next_tag_index, self.next_tag_index + 1))
        else:
            # Open-ended region (end is None); the tag is remembered on the
            # per-tag-name stack.
            self.ignored_regions.append((self.next_tag_index, None))
            self.ignored_tag_stacks[html_tag.tag].append(html_tag)
    elif self.ignored_tag_stacks.get(html_tag.tag):
        # A same-named tag is already on the ignored stack: push a
        # placeholder (presumably so the matching end tag can be told apart
        # from the ignored one -- confirm in the close-tag handler).
        self.ignored_tag_stacks[html_tag.tag].append(None)
    if self._read_bool_template_attribute(html_tag, "ignore-beneath"):
        self.ignored_regions.append((self.next_tag_index, None))

    # --- tag replacement -------------------------------------------------
    replacement = html_tag.attributes.pop("data-scrapy-replacement", None)
    if replacement:
        # Swap the most recently added token for one built from the
        # replacement name.
        self.token_list.pop()
        self._add_token(replacement, html_tag.tag_type, html_tag.start, html_tag.end)
        self.replacement_stacks[html_tag.tag].append(replacement)
    elif html_tag.tag in self.replacement_stacks:
        # Placeholder for a non-replaced tag of a name that has a
        # replacement stack.
        self.replacement_stacks[html_tag.tag].append(None)

    # --- unpaired tags still open ----------------------------------------
    if self.unpairedtag_stack:
        if html_tag.tag in _END_UNPAIREDTAG_TAGS:
            self._close_unpaired_tag()
        else:
            self.unpairedtag_stack.append(html_tag.tag)

    # Effective tag name: the replacement if there is one, otherwise
    # whatever _update_replacement_stack returns for this tag.
    tagname = replacement or self._update_replacement_stack(html_tag)
    self._handle_unclosed_tags(tagname, _AUTO_CLOSE_TAGS_ON_OPEN)

    # --- annotation ------------------------------------------------------
    jannotation = self._read_template_annotation(html_tag)
    if not jannotation:
        if tagname in self.labelled_tag_stacks:
            # add this tag to the stack to match correct end tag
            self.labelled_tag_stacks[tagname].append(None)
        self.next_tag_index += 1
        return

    # The end index stays None until it is known.
    annotation = AnnotationTag(self.next_tag_index, None)
    if jannotation.pop('generated', False):
        # Generated tag: drop its token and start the annotation one index
        # earlier.
        self.token_list.pop()
        annotation.start_index -= 1
        if self.previous_element_class == HtmlTag:
            annotation.annotation_text = AnnotationText('')
        else:
            annotation.annotation_text = AnnotationText(self.prev_data)
        if self._read_bool_template_attribute(html_tag, "ignore") \
                or self._read_bool_template_attribute(html_tag, "ignore-beneath"):
            # Shift the start of the most recently added ignored region back
            # by one as well.
            ignored = self.ignored_regions.pop()
            self.ignored_regions.append((ignored[0]-1, ignored[1]))

    self.extra_required_attrs.extend(jannotation.pop('required', []))

    # Split the per-attribute annotations: the entry under the content key
    # annotates the tag's content, all others annotate tag attributes.
    attribute_annotations = jannotation.pop('annotations', {}).items()
    content_key = jannotation.pop('text-content', 'content')
    for extract_attribute, tag_value in attribute_annotations:
        if extract_attribute == content_key:
            annotation.surrounds_attribute = tag_value
        else:
            annotation.tag_attributes.append((extract_attribute, tag_value))

    # NOTE(review): on Python 3 a string variant id makes ``> 0`` raise
    # TypeError, yet the ``== '0'`` check further down suggests strings were
    # expected at some point -- confirm the type stored in templates.
    variant_id = jannotation.pop('variant', 0)
    if variant_id > 0:
        if annotation.surrounds_attribute is not None:
            # Pushed on the stack instead of being set directly; it may be
            # picked up from the stack top below.
            self.variant_stack.append(variant_id)
        else:
            annotation.variant_id = variant_id

    # Remaining keys are kept as metadata.
    annotation.metadata = jannotation

    # Only advance the tag index when no annotation text was set above
    # (assumes AnnotationTag initialises annotation_text to None -- TODO
    # confirm).
    if annotation.annotation_text is None:
        self.next_tag_index += 1
    if self.variant_stack and annotation.variant_id is None:
        # Take the variant from the top of the stack; '0' means no variant.
        variant_id = self.variant_stack[-1]
        if variant_id == '0':
            variant_id = None
        annotation.variant_id = variant_id

    # look for a closing tag if the content is important
    if annotation.surrounds_attribute:
        self.labelled_tag_stacks[tagname].append(annotation)
    else:
        annotation.end_index = annotation.start_index + 1
        self.annotations.append(annotation)
def _handle_open_tag(self, html_tag):
    """Process an opening template tag.

    In order, this records ignored regions requested by the tag, applies a
    ``data-scrapy-replacement`` tag name if present, updates the
    unpaired-tag stack, lets ``_handle_unclosed_tags`` run for the effective
    tag name, and finally turns a template annotation on the tag (if any)
    into an ``AnnotationTag``.

    An annotation with a content annotation (``surrounds_attribute``) is
    pushed on ``self.labelled_tag_stacks[tagname]`` and left without an end
    index; any other annotation is closed immediately and appended to
    ``self.annotations``.

    Args:
        html_tag: the parsed opening tag.  Its ``attributes`` mapping is
            modified: ``data-scrapy-replacement`` is removed from it.
    """
    # --- ignored regions -------------------------------------------------
    if self._read_bool_template_attribute(html_tag, "ignore"):
        if html_tag.tag == "img":
            # An ignored img covers exactly one tag.
            self.ignored_regions.append((self.next_tag_index, self.next_tag_index + 1))
        else:
            # Open-ended region (end is None); the tag is remembered on the
            # per-tag-name stack.
            self.ignored_regions.append((self.next_tag_index, None))
            self.ignored_tag_stacks[html_tag.tag].append(html_tag)
    elif self.ignored_tag_stacks.get(html_tag.tag):
        # A same-named tag is already on the ignored stack: push a
        # placeholder (presumably so the matching end tag can be told apart
        # from the ignored one -- confirm in the close-tag handler).
        self.ignored_tag_stacks[html_tag.tag].append(None)
    if self._read_bool_template_attribute(html_tag, "ignore-beneath"):
        self.ignored_regions.append((self.next_tag_index, None))

    # --- tag replacement -------------------------------------------------
    replacement = html_tag.attributes.pop("data-scrapy-replacement", None)
    if replacement:
        # Swap the most recently added token for one built from the
        # replacement name.
        self.token_list.pop()
        self._add_token(replacement, html_tag.tag_type, html_tag.start, html_tag.end)
        self.replacement_stacks[html_tag.tag].append(replacement)
    elif html_tag.tag in self.replacement_stacks:
        # Placeholder for a non-replaced tag of a name that has a
        # replacement stack.
        self.replacement_stacks[html_tag.tag].append(None)

    # --- unpaired tags still open ----------------------------------------
    if self.unpairedtag_stack:
        if html_tag.tag in _END_UNPAIREDTAG_TAGS:
            self._close_unpaired_tag()
        else:
            self.unpairedtag_stack.append(html_tag.tag)

    # Effective tag name: the replacement if there is one, otherwise
    # whatever _update_replacement_stack returns for this tag.
    tagname = replacement or self._update_replacement_stack(html_tag)
    self._handle_unclosed_tags(tagname, _AUTO_CLOSE_TAGS_ON_OPEN)

    # --- annotation ------------------------------------------------------
    jannotation = self._read_template_annotation(html_tag)
    if not jannotation:
        if tagname in self.labelled_tag_stacks:
            # add this tag to the stack to match correct end tag
            self.labelled_tag_stacks[tagname].append(None)
        self.next_tag_index += 1
        return

    # The end index stays None until it is known.
    annotation = AnnotationTag(self.next_tag_index, None)
    if jannotation.pop('generated', False):
        # Generated tag: drop its token and start the annotation one index
        # earlier.
        self.token_list.pop()
        annotation.start_index -= 1
        if self.previous_element_class == HtmlTag:
            annotation.annotation_text = AnnotationText('')
        else:
            annotation.annotation_text = AnnotationText(self.prev_data)
        if self._read_bool_template_attribute(html_tag, "ignore") \
                or self._read_bool_template_attribute(html_tag, "ignore-beneath"):
            # Shift the start of the most recently added ignored region back
            # by one as well.
            ignored = self.ignored_regions.pop()
            self.ignored_regions.append((ignored[0]-1, ignored[1]))

    self.extra_required_attrs.extend(jannotation.pop('required', []))

    # Split the per-attribute annotations: the entry under the content key
    # annotates the tag's content, all others annotate tag attributes.
    attribute_annotations = jannotation.pop('annotations', {}).items()
    content_key = jannotation.pop('text-content', 'content')
    for extract_attribute, tag_value in attribute_annotations:
        if extract_attribute == content_key:
            annotation.surrounds_attribute = tag_value
        else:
            annotation.tag_attributes.append((extract_attribute, tag_value))

    # NOTE(review): on Python 3 a string variant id makes ``> 0`` raise
    # TypeError, yet the ``== '0'`` check further down suggests strings were
    # expected at some point -- confirm the type stored in templates.
    variant_id = jannotation.pop('variant', 0)
    if variant_id > 0:
        if annotation.surrounds_attribute is not None:
            # Pushed on the stack instead of being set directly; it may be
            # picked up from the stack top below.
            self.variant_stack.append(variant_id)
        else:
            annotation.variant_id = variant_id

    # Remaining keys are kept as metadata.
    annotation.metadata = jannotation

    # Only advance the tag index when no annotation text was set above
    # (assumes AnnotationTag initialises annotation_text to None -- TODO
    # confirm).
    if annotation.annotation_text is None:
        self.next_tag_index += 1
    if self.variant_stack and annotation.variant_id is None:
        # Take the variant from the top of the stack; '0' means no variant.
        variant_id = self.variant_stack[-1]
        if variant_id == '0':
            variant_id = None
        annotation.variant_id = variant_id

    # look for a closing tag if the content is important
    if annotation.surrounds_attribute:
        self.labelled_tag_stacks[tagname].append(annotation)
    else:
        annotation.end_index = annotation.start_index + 1
        self.annotations.append(annotation)