def load_annotations(body):
    """Create slybot annotations from annotated html.

    Every element carrying a ``data-scrapy-annotate`` attribute contributes
    the JSON object stored in that attribute, and every element carrying one
    of ``IGNORE_ATTRIBUTES`` contributes an ignore entry.  Entries without an
    ``id`` receive one from ``gen_id``.

    :param body: annotated html; a falsy value yields an empty result.
    :returns: ``{'annotations-plugin': {'extracts': [...]}}``.
    """
    if not body:
        return {'annotations-plugin': {'extracts': []}}
    sel = Selector(text=add_tagids(body))
    existing_ids = set()
    annotations = []
    for elem in sel.xpath('//*[@data-scrapy-annotate]'):
        attributes = elem.root.attrib
        try:
            # Load annotation json and skip malformed json strings
            annotation = json.loads(unquote(
                attributes['data-scrapy-annotate']))
        except ValueError:
            continue
        # Valid JSON is not necessarily an object: a bare string, number or
        # list cannot hold annotation fields, so treat it like malformed
        # input instead of failing on the item assignment below.
        if not isinstance(annotation, dict):
            continue
        if (isinstance(elem.root, _Element) and
                elem.root.tag.lower() == 'ins'):
            annotation.update(find_generated_annotation(elem))
        else:
            annotation['tagid'] = attributes.get('data-tagid')
        if 'id' not in annotation:
            annotation['id'] = gen_id(disallow=existing_ids)
        existing_ids.add(annotation['id'])
        annotations.append(annotation)
    for elem in sel.xpath('//*[@%s]' % '|@'.join(IGNORE_ATTRIBUTES)):
        attributes = elem.root.attrib
        # Stop at the first ignore attribute present on the element; the
        # loop variable is used after the loop.
        for attribute in IGNORE_ATTRIBUTES:
            if attribute in attributes:
                break
        # The key is the attribute name without its 'data-scrapy-' prefix
        ignore = {attribute[len('data-scrapy-'):]: True}
        if 'id' not in ignore:
            ignore['id'] = gen_id(disallow=existing_ids)
        existing_ids.add(ignore['id'])
        annotations.append(ignore)
    return {'annotations-plugin': {'extracts': annotations}}
def load_annotations(body):
    """Create slybot annotations from annotated html.

    Annotations are read from the JSON stored in ``data-scrapy-annotate``
    attributes; elements carrying one of ``IGNORE_ATTRIBUTES`` add an ignore
    entry.  Any entry lacking an ``id`` is given one by ``gen_id``.

    :param body: annotated html; a falsy value yields an empty result.
    :returns: ``{'annotations-plugin': {'extracts': [...]}}``.
    """
    if not body:
        return {'annotations-plugin': {'extracts': []}}
    sel = Selector(text=add_tagids(body))
    existing_ids = set()
    annotations = []
    for elem in sel.xpath('//*[@data-scrapy-annotate]'):
        attributes = elem.root.attrib
        try:
            # Load annotation json and skip malformed json strings
            annotation = json.loads(
                unquote(attributes['data-scrapy-annotate']))
        except ValueError:
            continue
        if not isinstance(annotation, dict):
            # json.loads also accepts scalars and arrays; those cannot be
            # used as annotations, so skip them rather than raising when a
            # key is assigned further down.
            continue
        if (isinstance(elem.root, _Element) and
                elem.root.tag.lower() == 'ins'):
            annotation.update(find_generated_annotation(elem))
        else:
            annotation['tagid'] = attributes.get('data-tagid')
        if 'id' not in annotation:
            annotation['id'] = gen_id(disallow=existing_ids)
        existing_ids.add(annotation['id'])
        annotations.append(annotation)
    for elem in sel.xpath('//*[@%s]' % '|@'.join(IGNORE_ATTRIBUTES)):
        attributes = elem.root.attrib
        # Find the first ignore attribute set on this element; ``attribute``
        # keeps that value once the loop is left.
        for attribute in IGNORE_ATTRIBUTES:
            if attribute in attributes:
                break
        # Drop the 'data-scrapy-' prefix to build the entry key
        ignore = {attribute[len('data-scrapy-'):]: True}
        if 'id' not in ignore:
            ignore['id'] = gen_id(disallow=existing_ids)
        existing_ids.add(ignore['id'])
        annotations.append(ignore)
    return {'annotations-plugin': {'extracts': annotations}}
def html4annotation(htmlpage, baseurl=None, proxy_resources=None):
    """Prepare an html document for the annotation UI.

    The page first gets tag ids via ``add_tagids`` and is then handed to
    ``descriptify`` together with the optional base url and resource proxy
    (adds tags, removes scripts and optionally adds a base url).
    """
    return descriptify(add_tagids(htmlpage), baseurl, proxy=proxy_resources)
def apply_annotations(annotations, target_page, legacy=False):
    """Write annotation data into the markup of ``target_page``.

    The page is passed through ``add_tagids`` and parsed with
    ``parse_html``; the parsed nodes are then copied to the output in
    document order.  For every tag id that has annotations, the attributes
    returned by ``_gen_annotation_info`` are set on the element reached for
    that id before it is serialized, and generated annotations are delegated
    to ``_get_generated_annotation``.

    :param annotations: annotations to apply; ``_filter_annotations`` splits
        them into selector based and tagid based groups.
    :param target_page: html to annotate.
    :param legacy: forwarded to ``_gen_annotation_info`` and
        ``_get_generated_annotation``.
    :returns: the joined output after ``remove_tagids``.
    """
    selector_annotations, tagid_annotations = _filter_annotations(annotations)
    # Text queued by _get_generated_annotation, keyed by tag id; it is
    # flushed when the tag with that id is popped from tag_stack below.
    inserts = defaultdict(list)
    numbered_html = add_tagids(target_page)
    if selector_annotations:
        # Selector annotations are converted against the numbered html and
        # then handled together with the tagid based ones.
        converted_annotations = apply_selector_annotations(
            selector_annotations, numbered_html)
        tagid_annotations += converted_annotations
    target = iter(parse_html(numbered_html))
    output, tag_stack = [], []
    element = next(target)
    last_id = 0
    # XXX: A dummy element is added to the end so if the last annotation is
    #      generated it will be added to the output
    # Group annotations by the tag id they refer to
    filtered = defaultdict(list)
    for grouped in tagid_annotations:
        for ann in arg_to_iter(grouped):
            filtered[ann['tagid']].append(ann)
    dummy = [(1e9, [{}])]
    # Process annotations in ascending numeric tag id order
    sorted_annotations = sorted(
        [(int(k), v) for k, v in filtered.items()] + dummy)
    try:
        for aid, annotation_data in sorted_annotations:
            # Move target until replacement/insertion point
            while True:
                # Copy through anything that is not a tag, and <ins> tags
                while not isinstance(element, HtmlTag) or element.tag == 'ins':
                    output.append(numbered_html[element.start:element.end])
                    element = next(target)
                if element.tag_type in {OPEN_TAG, UNPAIRED_TAG}:
                    last_id = element.attributes.get(TAGID)
                    tag_stack.append(last_id)
                if element.tag_type in {CLOSE_TAG, UNPAIRED_TAG} and tag_stack:
                    # '__added' marks elements already written to output so
                    # they are not emitted twice.
                    if ('__added' not in element.attributes and
                            last_id is not None and aid is not None and
                            int(last_id) < int(aid)):
                        output.append(numbered_html[element.start:element.end])
                        element.attributes['__added'] = True
                    last_inserted = tag_stack.pop()
                    to_insert = inserts.pop(last_inserted, None)
                    if to_insert:
                        output.extend(to_insert)
                        # Skip all nodes up to the next HtmlTag as these
                        # have already been added
                        while True:
                            element = next(target)
                            try:
                                last_id = element.attributes.get(TAGID,
                                                                 last_id)
                            except AttributeError:
                                # Nodes without an ``attributes`` attribute
                                # leave last_id unchanged.
                                pass
                            if isinstance(element, HtmlTag):
                                break
                        continue
                if (last_id is not None and aid is not None and
                        int(last_id) < int(aid)):
                    # Still before the annotated tag id: emit and advance
                    if '__added' not in element.attributes:
                        output.append(numbered_html[element.start:element.end])
                        element.attributes['__added'] = True
                    element = next(target)
                else:
                    break

            generated = []
            next_generated = []
            regular_annotations = []
            # Place generated annotations at the end and sort by slice
            for annotation in sorted(annotation_data, key=_annotation_key):
                if annotation.get('generated'):
                    if annotation.get('insert_after'):
                        next_generated.append(annotation)
                    else:
                        generated.append(annotation)
                else:
                    regular_annotations.append(annotation)
            # Add annotations data as required
            if regular_annotations:
                annotation_info = _gen_annotation_info(regular_annotations,
                                                       legacy)
                for key, val in annotation_info.items():
                    element.attributes[key] = val
            next_text_section = ''
            if generated:
                # tee() lets the helper look ahead without consuming target
                inner_data, target = tee(target)
                nodes = _get_inner_nodes(inner_data)
                next_text_section = _get_generated_annotation(
                    element, generated, nodes, numbered_html, inserts,
                    legacy)
            if next_generated:
                inner_data, target = tee(target)
                open_tags = 0 if element.tag_type == UNPAIRED_TAG else 1
                nodes = _get_inner_nodes(inner_data, open_tags=open_tags,
                                         insert_after=True)
                next_text_section = _get_generated_annotation(
                    element, next_generated, nodes, numbered_html, inserts,
                    legacy)

            # Serialize the element so attributes set above are included
            if '__added' not in element.attributes:
                output.append(serialize_tag(element))
                element.attributes['__added'] = True
            # If an <ins> tag has been inserted we need to move forward
            if next_text_section:
                while True:
                    elem = next(target)
                    if (isinstance(elem, HtmlDataFragment) and
                            elem.is_text_content):
                        break
                    output.append(numbered_html[elem.start:elem.end])
                output.append(next_text_section)
    # Reached the end of the document
    except StopIteration:
        output.append(numbered_html[element.start:element.end])
    else:
        # All annotations were handled: copy whatever is left of the page
        for element in target:
            output.append(numbered_html[element.start:element.end])
    return remove_tagids(''.join(output))
def numbered_html(self):
    """Return ``add_tagids(self.html)``, computed once and then cached.

    The result is stored on the instance as ``_numbered_html`` and reused on
    later calls.
    """
    if not hasattr(self, '_numbered_html'):
        self._numbered_html = add_tagids(self.html)
    return self._numbered_html
def port_sample(sample, schemas=None, extractors=None):
    """Convert slybot samples made before slybot 0.13 to new format.

    The sample dict is modified in place and returned together with the
    schemas obtained from ``guess_schema``.

    :param sample: sample dict to convert; ``sample['page_id']`` is read
        unless the sample already has the current ``SLYBOT_VERSION``.
    :param schemas: optional schemas dict, defaults to ``{}``; passed to
        ``guess_schema``.
    :param extractors: optional extractors dict, defaults to ``{}``; passed
        to ``port_standard``.
    :returns: a ``(sample, schemas)`` tuple.
    :raises KeyError: if an unconverted sample has no ``page_id``.
    """
    if schemas is None:
        schemas = {}
    if extractors is None:
        extractors = {}
    # Samples already at the current version are returned untouched
    if sample.get('version') == SLYBOT_VERSION:
        return sample, schemas
    container_id = gen_predictable_id(sample.get('id', 1), sample['page_id'])
    schema_id, schemas = guess_schema(sample, schemas)
    # Fallback used when the sample has nothing to port: one 'body' container
    default_annotations = [
        _create_container('body', container_id, schema_id=schema_id)
    ]
    if not sample.get('annotated_body') and not sample.get('plugins'):
        sample['plugins'] = {
            'annotations-plugin': {
                'extracts': default_annotations
            }
        }
        return sample, schemas
    if not sample.get('plugins'):
        sample['plugins'] = load_annotations(sample.get('annotated_body', u''))
    else:
        repair_ids(sample)
    # Group annotations by type
    annotations = sample['plugins']['annotations-plugin']['extracts']
    try:
        sel = Selector(text=add_tagids(sample['original_body']))
    except KeyError:
        # No original_body: use the annotated body in its place
        annotated = sample.get('annotated_body', u'')
        sample['original_body'] = annotated
        try:
            tagged = add_tagids(annotated)
        except KeyError:
            tagged = u''
        sel = Selector(text=tagged)
    sample.pop('annotated_body', None)
    annotations = port_standard(annotations, sel, sample, extractors)
    standard_annos, generated_annos, variant_annos = [], [], []
    for a in annotations:
        if a.get('generated'):
            generated_annos.append(a)
        elif a.get('variants', 0) > 0:
            variant_annos.append(a)
        else:
            standard_annos.append(a)
    if not annotations:
        sample['plugins'] = {
            'annotations-plugin': {
                'extracts': default_annotations
            }
        }
        return sample, schemas
    new_annotations = []
    # Reduce all annotated elements to a single common parent element
    a = find_element(annotations[0], sel)
    for b in annotations[1:]:
        b = find_element(b, sel)
        a = find_common_parent(a, b)
    parent = a.getparent()
    # The container is built on the parent of the common element when there
    # is one, otherwise on the common element itself.
    container = _create_container(a if parent is None else parent,
                                  container_id, selector=sel,
                                  schema_id=schema_id)
    new_annotations.append(container)
    for a in standard_annos:
        a.pop('variant', None)
    new_annotations.extend(standard_annos)
    new_annotations.extend(port_generated(generated_annos, sel))
    new_annotations.extend(port_variants(variant_annos, sel))
    for a in new_annotations:
        # Annotations that are not item containers with their own
        # container_id are attached to the main container; the main
        # container itself is skipped.
        if not (a.get('item_container') and a.get('container_id')):
            if container_id == a.get('id'):
                continue
            a['container_id'] = container_id
        # 'data-tagid' is only popped when popping 'tagid' gave a falsy value
        a.pop('tagid', None) or a.pop('data-tagid', None)
    # Update annotations
    sample['plugins']['annotations-plugin']['extracts'] = new_annotations
    sample['version'] = SLYBOT_VERSION
    return sample, schemas
def port_sample(sample, schemas=None, extractors=None):
    """Convert slybot samples made before slybot 0.13 to new format.

    The sample dict is modified in place and returned together with the
    schemas obtained from ``guess_schema``.

    :param sample: sample dict to convert.  A missing ``url`` is filled with
        ``'http://example.com'``; ``page_id`` (or ``name`` when ``page_id``
        is absent) is used to build the container id.
    :param schemas: optional schemas dict, defaults to ``{}``; passed to
        ``guess_schema``.
    :param extractors: optional extractors dict, defaults to ``{}``; passed
        to ``port_standard``.
    :returns: a ``(sample, schemas)`` tuple.
    :raises KeyError: if an unconverted sample has neither ``page_id`` nor
        ``name``.
    """
    if schemas is None:
        schemas = {}
    if extractors is None:
        extractors = {}
    # Samples already at the current version are returned untouched
    if sample.get('version') == SLYBOT_VERSION:
        return sample, schemas
    if 'url' not in sample:
        sample['url'] = 'http://example.com'
    # Only fall back to ``name`` when ``page_id`` is missing.  Passing
    # ``sample['name']`` as the default of ``dict.get`` would evaluate it
    # eagerly and fail for samples that have a page_id but no name.
    if 'page_id' in sample:
        page_id = sample['page_id']
    else:
        page_id = sample['name']
    container_id = gen_predictable_id(sample.get('id', 1), page_id)
    schema_id, schemas = guess_schema(sample, schemas)
    # Fallback used when the sample has nothing to port: one 'body' container
    default_annotations = [_create_container('body', container_id,
                                             schema_id=schema_id)]
    if not sample.get('annotated_body') and not sample.get('plugins'):
        sample['plugins'] = {
            'annotations-plugin': {
                'extracts': default_annotations
            }
        }
        return sample, schemas
    if not sample.get('plugins'):
        sample['plugins'] = load_annotations(sample.get('annotated_body', u''))
    else:
        repair_ids(sample)
    # Group annotations by type
    annotations = sample['plugins']['annotations-plugin']['extracts']
    try:
        sel = Selector(text=add_tagids(sample['original_body']))
    except KeyError:
        # No original_body: use the annotated body in its place
        annotated = sample.get('annotated_body', u'')
        sample['original_body'] = annotated
        try:
            tagged = add_tagids(annotated)
        except KeyError:
            tagged = u''
        sel = Selector(text=tagged)
    sample.pop('annotated_body', None)
    annotations = port_standard(annotations, sel, sample, extractors)
    standard_annos, generated_annos, variant_annos = [], [], []
    for a in annotations:
        if a.get('generated'):
            generated_annos.append(a)
        elif a.get('variants', 0) > 0:
            variant_annos.append(a)
        else:
            standard_annos.append(a)
    if not annotations:
        sample['plugins'] = {
            'annotations-plugin': {
                'extracts': default_annotations
            }
        }
        return sample, schemas
    new_annotations = []
    # Reduce all annotated elements to a single common parent element
    a = find_element(annotations[0], sel)
    for b in annotations[1:]:
        b = find_element(b, sel)
        a = find_common_parent(a, b)
    parent = a.getparent()
    # The container is built on the parent of the common element when there
    # is one, otherwise on the common element itself.
    container = _create_container(
        a if parent is None else parent, container_id, selector=sel,
        schema_id=schema_id)
    new_annotations.append(container)
    for a in standard_annos:
        a.pop('variant', None)
    new_annotations.extend(standard_annos)
    new_annotations.extend(port_generated(generated_annos, sel))
    new_annotations.extend(port_variants(variant_annos, sel))
    for a in new_annotations:
        # Annotations that are not item containers with their own
        # container_id are attached to the main container; the main
        # container itself is skipped.
        if not (a.get('item_container') and a.get('container_id')):
            if container_id == a.get('id'):
                continue
            a['container_id'] = container_id
        # 'data-tagid' is only popped when popping 'tagid' gave a falsy value
        a.pop('tagid', None) or a.pop('data-tagid', None)
    # Update annotations
    sample['plugins']['annotations-plugin']['extracts'] = new_annotations
    sample['version'] = SLYBOT_VERSION
    return sample, schemas