def test_annotation(self): html_page = HtmlPage(body=TEST_PAGE) template = { 'original_body': html_page.body } data = { 'extracts': [ { 'annotations': {'href': 'origin'}, 'id': 'test-id-123', 'required': [], 'tagid': 123, 'variant': 0 } ] } annotations = Annotations() annotations.save_extraction_data(data, template) sample = HtmlPage(body=add_tagids(template['annotated_body'])) for element in sample.parsed_body: if isinstance(element, HtmlTag): tagid = element.attributes.get(TAGID, None) if tagid and int(tagid) == data['extracts'][0]['tagid']: annotation = element.attributes.get('data-scrapy-annotate') self.assertTrue(annotation) self.assertTrue('"id": "test-id-123"')
def apply_annotations(annotations, target_page): inserts = defaultdict(list) numbered_html = add_tagids(target_page) target = parse_html(numbered_html) output, tag_stack = [], [] element = target.next() last_id = 0 # XXX: A dummy element is added to the end so if the last annotation is # generated it will be added to the output filtered = defaultdict(list) for ann in annotations: if ann and ann.get('tagid') and (ann.get('annotations') or ann.get('ignore')): filtered[ann['tagid']].append(ann) dummy = [(1e9, [{}])] sorted_annotations = sorted([(int(k), v) for k, v in filtered.items()] + dummy) try: for aid, annotation_data in sorted_annotations: # Move target until replacement/insertion point while True: while not isinstance(element, HtmlTag): output.append(numbered_html[element.start:element.end]) element = target.next() if element.tag_type in {OPEN_TAG, UNPAIRED_TAG}: last_id = element.attributes.get(TAGID) tag_stack.append(last_id) if element.tag_type in {CLOSE_TAG, UNPAIRED_TAG} and tag_stack: if ('__added' not in element.attributes and int(last_id) < int(aid)): output.append(numbered_html[element.start:element.end]) element.attributes['__added'] = True last_inserted = tag_stack.pop() to_insert = inserts.pop(last_inserted, None) if to_insert: output.extend(to_insert) # Skip all nodes up to the next HtmlTag as these # have already been added while True: element = target.next() try: last_id = element.attributes.get(TAGID, last_id) except AttributeError: pass if isinstance(element, HtmlTag): break continue if last_id is not None and int(last_id) < int(aid): if '__added' not in element.attributes: output.append(numbered_html[element.start:element.end]) element.attributes['__added'] = True element = target.next() else: break generated = [] next_generated = [] # Place generated annotations at the end and sort by slice for annotation in sorted(annotation_data, key=_annotation_key): if annotation.get('generated'): if annotation.get('insert_after'): next_generated.append(annotation) else: generated.append(annotation) else: # Add annotations data as required annotation_info = _gen_annotation_info(annotation) for key, val in annotation_info.items(): element.attributes[key] = val next_text_section = '' if generated: inner_data, target = tee(target) nodes = _get_inner_nodes(inner_data) next_text_section = _get_generated_annotation( element, generated, nodes, numbered_html, inserts) if next_generated: inner_data, target = tee(target) open_tags = 0 if element.tag_type == UNPAIRED_TAG else 1 nodes = _get_inner_nodes(inner_data, open_tags=open_tags, insert_after=True) next_text_section = _get_generated_annotation( element, next_generated, nodes, numbered_html, inserts) if '__added' not in element.attributes: output.append(serialize_tag(element)) element.attributes['__added'] = True # If an <ins> tag has been inserted we need to move forward if next_text_section: while True: elem = target.next() if (isinstance(elem, HtmlDataFragment) and elem.is_text_content): break output.append(numbered_html[elem.start:elem.end]) output.append(next_text_section) # Reached the end of the document except StopIteration: output.append(numbered_html[element.start:element.end]) else: for element in target: output.append(numbered_html[element.start:element.end]) return remove_tagids(''.join(output))
def apply_annotations(annotations, target_page): inserts = defaultdict(list) numbered_html = add_tagids(target_page) target = parse_html(numbered_html) output, tag_stack = [], [] element = next(target) last_id = 0 # XXX: A dummy element is added to the end so if the last annotation is # generated it will be added to the output filtered = defaultdict(list) for ann in annotations: if ann and ann.get('tagid') and (ann.get('annotations') or ann.get('ignore')): filtered[ann['tagid']].append(ann) dummy = [(1e9, [{}])] sorted_annotations = sorted([(int(k), v) for k, v in filtered.items()] + dummy) try: for aid, annotation_data in sorted_annotations: # Move target until replacement/insertion point while True: while not isinstance(element, HtmlTag): output.append(numbered_html[element.start:element.end]) element = next(target) if element.tag_type in {OPEN_TAG, UNPAIRED_TAG}: last_id = element.attributes.get(TAGID) tag_stack.append(last_id) if element.tag_type in {CLOSE_TAG, UNPAIRED_TAG} and tag_stack: if ('__added' not in element.attributes and int(last_id) < int(aid)): output.append(numbered_html[element.start:element.end]) element.attributes['__added'] = True last_inserted = tag_stack.pop() to_insert = inserts.pop(last_inserted, None) if to_insert: output.extend(to_insert) # Skip all nodes up to the next HtmlTag as these # have already been added while True: element = next(target) try: last_id = element.attributes.get(TAGID, last_id) except AttributeError: pass if isinstance(element, HtmlTag): break continue if last_id is not None and int(last_id) < int(aid): if '__added' not in element.attributes: output.append(numbered_html[element.start:element.end]) element.attributes['__added'] = True element = next(target) else: break generated = [] next_generated = [] # Place generated annotations at the end and sort by slice for annotation in sorted(annotation_data, key=_annotation_key): if annotation.get('generated'): if annotation.get('insert_after'): next_generated.append(annotation) else: generated.append(annotation) else: # Add annotations data as required annotation_info = _gen_annotation_info(annotation) for key, val in annotation_info.items(): element.attributes[key] = val next_text_section = '' if generated: inner_data, target = tee(target) nodes = _get_inner_nodes(inner_data) next_text_section = _get_generated_annotation( element, generated, nodes, numbered_html, inserts) if next_generated: inner_data, target = tee(target) open_tags = 0 if element.tag_type == UNPAIRED_TAG else 1 nodes = _get_inner_nodes(inner_data, open_tags=open_tags, insert_after=True) next_text_section = _get_generated_annotation( element, next_generated, nodes, numbered_html, inserts) if '__added' not in element.attributes: output.append(serialize_tag(element)) element.attributes['__added'] = True # If an <ins> tag has been inserted we need to move forward if next_text_section: while True: elem = next(target) if (isinstance(elem, HtmlDataFragment) and elem.is_text_content): break output.append(numbered_html[elem.start:elem.end]) output.append(next_text_section) # Reached the end of the document except StopIteration: output.append(numbered_html[element.start:element.end]) else: for element in target: output.append(numbered_html[element.start:element.end]) return remove_tagids(''.join(output))