def port_sample(sample):
    """Convert slybot samples made before slybot 0.13 to new format.

    ``sample`` is modified in place and the same object is returned.

    * If ``annotated_body`` is missing or empty there is nothing to port:
      an empty ``annotations-plugin`` entry is installed when ``plugins`` is
      missing or empty, and the sample is returned.
    * Otherwise annotations are loaded from the annotated body (unless
      ``plugins`` is already populated), ``annotated_body`` is removed, and
      the annotations are ported, attached to one newly created container
      annotation and written back together with ``version``.
    * If porting yields no annotations, the sample is returned as is
      (``version`` is not set in that case).

    :param sample: dict-like sample; the keys read here are
        ``annotated_body``, ``plugins`` and ``original_body``.
    :returns: the ``sample`` object that was passed in.
    """
    annotated_body = sample.get('annotated_body')
    if not annotated_body:
        # Handle empty body
        if not sample.get('plugins'):
            sample['plugins'] = {'annotations-plugin': {'extracts': []}}
        return sample
    if not sample.get('plugins'):
        sample['plugins'] = load_annotations(annotated_body)
    del sample['annotated_body']

    # Group annotations by type
    annotations = sample['plugins']['annotations-plugin']['extracts']
    try:
        sel = Selector(text=add_tagids(sample['original_body']))
    except KeyError:
        # 'annotated_body' has already been deleted from the sample, so the
        # fallback must use the value captured above instead of reading the
        # key again (which could only raise another KeyError).
        sample['original_body'] = annotated_body
        sel = Selector(text=add_tagids(annotated_body))
    annotations = port_standard(annotations, sel, sample)
    standard_annos, generated_annos, variant_annos = [], [], []
    for annotation in annotations:
        if annotation.get('generated'):
            generated_annos.append(annotation)
        elif annotation.get('variants', 0) > 0:
            variant_annos.append(annotation)
        else:
            standard_annos.append(annotation)
    if not annotations:
        return sample

    # Fold all annotated elements into their common ancestor; the container
    # is created from that ancestor's parent when it has one.
    common = find_element(annotations[0], sel)
    for other in annotations[1:]:
        common = find_common_parent(common, find_element(other, sel))
    container_id = gen_id()
    parent = common.getparent()
    new_annotations = [
        _create_container(common if parent is None else parent,
                          container_id, selector=sel)
    ]

    for annotation in standard_annos:
        annotation.pop('variant', None)
    new_annotations.extend(standard_annos)
    new_annotations.extend(port_generated(generated_annos, sel))
    new_annotations.extend(port_variants(variant_annos, sel))

    for annotation in new_annotations:
        # Annotations that are item containers and already name a container
        # keep it; everything else is attached to the new container.
        if not (annotation.get('item_container') and
                annotation.get('container_id')):
            annotation['container_id'] = container_id
        # Drop the legacy tag id; 'data-tagid' is only removed when 'tagid'
        # was absent or falsy.
        if not annotation.pop('tagid', None):
            annotation.pop('data-tagid', None)

    # Update annotations
    sample['plugins']['annotations-plugin']['extracts'] = new_annotations
    sample['version'] = SLYBOT_VERSION
    return sample
def port_sample(sample):
    """Convert slybot samples made before slybot 0.13 to new format.

    ``sample`` is modified in place and the same object is returned.

    * If ``annotated_body`` is missing or empty there is nothing to port:
      an empty ``annotations-plugin`` entry is installed when ``plugins`` is
      missing or empty, and the sample is returned.
    * Otherwise annotations are loaded from the annotated body (unless
      ``plugins`` is already populated), ``annotated_body`` is removed, and
      the annotations are ported, attached to one newly created container
      annotation and written back together with ``version``.
    * If porting yields no annotations, the sample is returned as is
      (``version`` is not set in that case).

    :param sample: dict-like sample; the keys read here are
        ``annotated_body``, ``plugins`` and ``original_body``.
    :returns: the ``sample`` object that was passed in.
    """
    annotated_body = sample.get("annotated_body")
    if not annotated_body:
        # Handle empty body
        if not sample.get("plugins"):
            sample["plugins"] = {"annotations-plugin": {"extracts": []}}
        return sample
    if not sample.get("plugins"):
        sample["plugins"] = load_annotations(annotated_body)
    del sample["annotated_body"]

    # Group annotations by type
    annotations = sample["plugins"]["annotations-plugin"]["extracts"]
    try:
        sel = Selector(text=add_tagids(sample["original_body"]))
    except KeyError:
        # "annotated_body" was deleted above; use the captured value rather
        # than re-reading the key, which could only raise KeyError again.
        sample["original_body"] = annotated_body
        sel = Selector(text=add_tagids(annotated_body))
    annotations = port_standard(annotations, sel, sample)
    standard_annos, generated_annos, variant_annos = [], [], []
    for annotation in annotations:
        if annotation.get("generated"):
            generated_annos.append(annotation)
        elif annotation.get("variants", 0) > 0:
            variant_annos.append(annotation)
        else:
            standard_annos.append(annotation)
    if not annotations:
        return sample

    # Reduce every annotated element to the ancestor they all share; the
    # container is created from that ancestor's parent when it has one.
    common = find_element(annotations[0], sel)
    for other in annotations[1:]:
        common = find_common_parent(common, find_element(other, sel))
    container_id = gen_id()
    parent = common.getparent()
    new_annotations = [
        _create_container(
            common if parent is None else parent, container_id, selector=sel
        )
    ]

    for annotation in standard_annos:
        annotation.pop("variant", None)
    new_annotations.extend(standard_annos)
    new_annotations.extend(port_generated(generated_annos, sel))
    new_annotations.extend(port_variants(variant_annos, sel))

    for annotation in new_annotations:
        # Only annotations that are item containers with their own
        # container id keep it; the rest join the new container.
        if not (annotation.get("item_container") and annotation.get("container_id")):
            annotation["container_id"] = container_id
        # Remove the legacy tag id; "data-tagid" is only removed when
        # "tagid" was absent or falsy.
        if not annotation.pop("tagid", None):
            annotation.pop("data-tagid", None)

    # Update annotations
    sample["plugins"]["annotations-plugin"]["extracts"] = new_annotations
    sample["version"] = SLYBOT_VERSION
    return sample
def port_sample(sample):
    """Port a sample created before slybot 0.13 to the new format.

    The sample is updated in place and returned. A sample whose
    ``annotated_body`` is missing or empty is returned untouched. Otherwise
    the annotations are loaded from the annotated body when ``plugins`` is
    missing or empty, ``annotated_body`` is removed, the annotations are
    ported by kind (standard, generated, variant) and stored back, and
    ``version`` is set to ``SLYBOT_VERSION``.
    """
    annotated_body = sample.get('annotated_body')
    if not annotated_body:
        # Nothing to port without an annotated body.
        return sample

    if not sample.get('plugins'):
        sample['plugins'] = load_annotations(annotated_body)
    del sample['annotated_body']

    extracts = sample['plugins']['annotations-plugin']['extracts']
    selector = Selector(text=add_tagids(sample['original_body']))
    ported = port_standard(extracts, selector, sample)

    # Sort each ported annotation into exactly one bucket.
    standard, generated, variants = [], [], []
    for annotation in ported:
        if annotation.get('generated'):
            bucket = generated
        elif annotation.get('variants', 0) > 0:
            bucket = variants
        else:
            bucket = standard
        bucket.append(annotation)

    # Standard annotations do not keep a 'variant' key.
    for annotation in standard:
        annotation.pop('variant', None)

    result = list(standard)
    result.extend(port_generated(generated, selector))
    result.extend(port_variants(variants, selector))

    sample['plugins']['annotations-plugin']['extracts'] = result
    sample['version'] = SLYBOT_VERSION
    return sample
def load_annotations(body):
    """Create slybot annotations from annotated html.

    Two kinds of elements are collected from ``body`` (after it has been
    passed through ``add_tagids``):

    * elements with a ``data-scrapy-annotate`` attribute, whose URL-quoted
      JSON value becomes the annotation. ``<ins>`` elements are completed
      with ``find_generated_annotation``; all others record the element's
      ``data-tagid`` under ``tagid``.
    * elements carrying one of ``IGNORE_ATTRIBUTES``, which become
      ``{<attribute name without 'data-scrapy-'>: True}`` entries.

    Every entry without an ``id`` receives one from ``gen_id``.

    :param body: annotated HTML; a falsy value yields an empty result.
    :returns: ``{'annotations-plugin': {'extracts': [...]}}``.
    """
    if not body:
        return {'annotations-plugin': {'extracts': []}}
    sel = Selector(text=add_tagids(body))
    existing_ids = set()
    annotations = []
    for elem in sel.xpath('//*[@data-scrapy-annotate]'):
        # Use the selector's public ``root`` attribute rather than the
        # private ``_root``.
        root = elem.root
        attributes = root.attrib
        annotation = json.loads(unquote(attributes['data-scrapy-annotate']))
        if isinstance(root, _Element) and root.tag.lower() == 'ins':
            annotation.update(find_generated_annotation(elem))
        else:
            annotation['tagid'] = attributes.get('data-tagid')
        if 'id' not in annotation:
            annotation['id'] = gen_id(disallow=existing_ids)
        existing_ids.add(annotation['id'])
        annotations.append(annotation)
    for elem in sel.xpath('//*[@%s]' % '|@'.join(IGNORE_ATTRIBUTES)):
        attributes = elem.root.attrib
        # Pick the first ignore attribute present on the element; the query
        # above only selects elements that have at least one of them.
        for attribute in IGNORE_ATTRIBUTES:
            if attribute in attributes:
                break
        ignore = {attribute[len('data-scrapy-'):]: True}
        if 'id' not in ignore:
            ignore['id'] = gen_id(disallow=existing_ids)
        existing_ids.add(ignore['id'])
        annotations.append(ignore)
    return {'annotations-plugin': {'extracts': annotations}}
def load_annotations(body):
    """Build the slybot annotations-plugin structure from annotated html.

    A falsy ``body`` gives an empty extracts list. Otherwise the body is
    tagged with ``add_tagids`` and scanned twice: first for elements with a
    ``data-scrapy-annotate`` attribute (URL-quoted JSON annotations), then
    for elements carrying one of ``IGNORE_ATTRIBUTES``. Entries lacking an
    ``id`` get one from ``gen_id``.

    Returns ``{'annotations-plugin': {'extracts': [...]}}``.
    """
    if not body:
        return {'annotations-plugin': {'extracts': []}}

    selector = Selector(text=add_tagids(body))
    used_ids = set()
    extracts = []

    # Pass 1: elements that carry a serialized annotation.
    for node in selector.xpath('//*[@data-scrapy-annotate]'):
        attrs = node.root.attrib
        payload = unquote(attrs['data-scrapy-annotate'])
        annotation = json.loads(payload)
        if not isinstance(node.root, _Element) or \
                node.root.tag.lower() != 'ins':
            annotation['tagid'] = attrs.get('data-tagid')
        else:
            # <ins> elements are completed by find_generated_annotation.
            annotation.update(find_generated_annotation(node))
        if 'id' not in annotation:
            annotation['id'] = gen_id(disallow=used_ids)
        used_ids.add(annotation['id'])
        extracts.append(annotation)

    # Pass 2: elements flagged with one of the ignore attributes.
    ignore_query = '//*[@%s]' % '|@'.join(IGNORE_ATTRIBUTES)
    prefix_length = len('data-scrapy-')
    for node in selector.xpath(ignore_query):
        attrs = node.root.attrib
        # Stop at the first ignore attribute found on this element.
        for attribute in IGNORE_ATTRIBUTES:
            if attribute in attrs:
                break
        entry = {attribute[prefix_length:]: True}
        if 'id' not in entry:
            entry['id'] = gen_id(disallow=used_ids)
        used_ids.add(entry['id'])
        extracts.append(entry)

    return {'annotations-plugin': {'extracts': extracts}}
def load_annotations(body):
    """Build the slybot annotations-plugin structure from annotated html.

    A falsy ``body`` gives an empty extracts list. Otherwise the body is
    tagged with ``add_tagids`` and scanned twice: first for elements with a
    ``data-scrapy-annotate`` attribute (URL-quoted JSON annotations), then
    for elements carrying one of ``IGNORE_ATTRIBUTES``. Entries lacking an
    ``id`` get one from ``gen_id``.

    Returns ``{"annotations-plugin": {"extracts": [...]}}``.
    """
    if not body:
        return {"annotations-plugin": {"extracts": []}}

    selector = Selector(text=add_tagids(body))
    used_ids = set()
    extracts = []

    # Pass 1: elements that carry a serialized annotation.
    for node in selector.xpath("//*[@data-scrapy-annotate]"):
        attrs = node.root.attrib
        payload = unquote(attrs["data-scrapy-annotate"])
        annotation = json.loads(payload)
        if not isinstance(node.root, _Element) or node.root.tag.lower() != "ins":
            annotation["tagid"] = attrs.get("data-tagid")
        else:
            # <ins> elements are completed by find_generated_annotation.
            annotation.update(find_generated_annotation(node))
        if "id" not in annotation:
            annotation["id"] = gen_id(disallow=used_ids)
        used_ids.add(annotation["id"])
        extracts.append(annotation)

    # Pass 2: elements flagged with one of the ignore attributes.
    ignore_query = "//*[@%s]" % "|@".join(IGNORE_ATTRIBUTES)
    prefix_length = len("data-scrapy-")
    for node in selector.xpath(ignore_query):
        attrs = node.root.attrib
        # Stop at the first ignore attribute found on this element.
        for attribute in IGNORE_ATTRIBUTES:
            if attribute in attrs:
                break
        entry = {attribute[prefix_length:]: True}
        if "id" not in entry:
            entry["id"] = gen_id(disallow=used_ids)
        used_ids.add(entry["id"])
        extracts.append(entry)

    return {"annotations-plugin": {"extracts": extracts}}
def port_sample(sample):
    """Port a sample created before slybot 0.13 to the new format.

    The sample is updated in place and returned. A sample whose
    ``annotated_body`` is missing or empty is returned untouched. Otherwise
    the annotations are loaded from the annotated body when ``plugins`` is
    missing or empty, ``annotated_body`` is removed, the annotations are
    ported by kind (standard, generated, variant) and stored back, and
    ``version`` is set to ``SLYBOT_VERSION``.
    """
    annotated_body = sample.get("annotated_body")
    if not annotated_body:
        # Nothing to port without an annotated body.
        return sample

    if not sample.get("plugins"):
        sample["plugins"] = load_annotations(annotated_body)
    del sample["annotated_body"]

    extracts = sample["plugins"]["annotations-plugin"]["extracts"]
    selector = Selector(text=add_tagids(sample["original_body"]))
    ported = port_standard(extracts, selector, sample)

    # Sort each ported annotation into exactly one bucket.
    standard, generated, variants = [], [], []
    for annotation in ported:
        if annotation.get("generated"):
            bucket = generated
        elif annotation.get("variants", 0) > 0:
            bucket = variants
        else:
            bucket = standard
        bucket.append(annotation)

    # Standard annotations do not keep a "variant" key.
    for annotation in standard:
        annotation.pop("variant", None)

    result = list(standard)
    result.extend(port_generated(generated, selector))
    result.extend(port_variants(variants, selector))

    sample["plugins"]["annotations-plugin"]["extracts"] = result
    sample["version"] = SLYBOT_VERSION
    return sample
def port_sample(sample, schemas=None):
    """Convert slybot samples made before slybot 0.13 to new format.

    ``sample`` is modified in place and the same object is returned.

    * With neither an annotated body nor plugins, ``plugins`` is set to a
      single default container annotation created for ``'body'``.
    * Otherwise annotations are loaded from the annotated body (unless
      ``plugins`` is already populated), ``annotated_body`` is removed, and
      the annotations are ported and attached to one container annotation.
      ``version`` is set and the container's ``schema_id`` is filled in
      with the result of ``guess_schema(sample, schemas)``.
    * If porting yields no annotations, ``plugins`` is replaced by the
      default container and the sample is returned (``version`` and
      ``schema_id`` are not set in that case).

    :param sample: dict-like sample; the keys read here are
        ``annotated_body``, ``plugins`` and ``original_body``.
    :param schemas: passed through to ``guess_schema``; ``None`` is treated
        as an empty dict.
    :returns: the ``sample`` object that was passed in.
    """
    if schemas is None:
        schemas = {}
    container_id = gen_id()
    default_annotations = [_create_container('body', container_id)]
    if not sample.get('annotated_body') and not sample.get('plugins'):
        sample['plugins'] = {
            'annotations-plugin': {
                'extracts': default_annotations
            }
        }
        return sample
    if not sample.get('plugins'):
        sample['plugins'] = load_annotations(sample.get('annotated_body', u''))
    # Keep the removed markup: it is the fallback source for 'original_body'
    # below. Reading the key again after the pop would always give u''.
    annotated = sample.pop('annotated_body', None) or u''

    # Group annotations by type
    annotations = sample['plugins']['annotations-plugin']['extracts']
    try:
        sel = Selector(text=add_tagids(sample['original_body']))
    except KeyError:
        sample['original_body'] = annotated
        try:
            tagged = add_tagids(annotated)
        except KeyError:
            tagged = u''
        sel = Selector(text=tagged)
    annotations = port_standard(annotations, sel, sample)
    standard_annos, generated_annos, variant_annos = [], [], []
    for annotation in annotations:
        if annotation.get('generated'):
            generated_annos.append(annotation)
        elif annotation.get('variants', 0) > 0:
            variant_annos.append(annotation)
        else:
            standard_annos.append(annotation)
    if not annotations:
        sample['plugins'] = {
            'annotations-plugin': {
                'extracts': default_annotations
            }
        }
        return sample

    # Fold all annotated elements into their common ancestor; the container
    # is created from that ancestor's parent when it has one.
    common = find_element(annotations[0], sel)
    for other in annotations[1:]:
        common = find_common_parent(common, find_element(other, sel))
    parent = common.getparent()
    container = _create_container(common if parent is None else parent,
                                  container_id, selector=sel)
    new_annotations = [container]

    for annotation in standard_annos:
        annotation.pop('variant', None)
    new_annotations.extend(standard_annos)
    new_annotations.extend(port_generated(generated_annos, sel))
    new_annotations.extend(port_variants(variant_annos, sel))

    for annotation in new_annotations:
        # Annotations that are item containers and already name a container
        # keep it; everything else is attached to the new container.
        if not (annotation.get('item_container') and
                annotation.get('container_id')):
            annotation['container_id'] = container_id
        # Drop the legacy tag id; 'data-tagid' is only removed when 'tagid'
        # was absent or falsy.
        if not annotation.pop('tagid', None):
            annotation.pop('data-tagid', None)

    # Update annotations
    sample['plugins']['annotations-plugin']['extracts'] = new_annotations
    sample['version'] = SLYBOT_VERSION
    # The schema is guessed from the fully ported sample.
    container['schema_id'] = guess_schema(sample, schemas)
    return sample