Example #1
0
def port_sample(sample):
    """Convert slybot samples made before slybot 0.13 to new format.

    The sample dict is updated in place and the same object is returned.

    * With no (or an empty) ``annotated_body`` the sample is returned as is,
      except that an empty ``annotations-plugin`` entry is added when the
      sample has no ``plugins`` yet.
    * Otherwise ``annotated_body`` is removed from the sample, the
      annotations are ported and every annotation is linked to a newly
      created container annotation.  ``version`` is set to
      ``SLYBOT_VERSION`` only when there were annotations to port.
    """
    if not sample.get('annotated_body'):
        if not sample.get('plugins'):
            sample['plugins'] = {'annotations-plugin': {'extracts': []}}
        return sample  # Handle empty body
    # Keep the annotated markup in a local: the key is dropped from the
    # sample, but the markup is still needed below as a fallback when the
    # sample has no usable original body.
    annotated = sample.pop('annotated_body')
    if not sample.get('plugins'):
        sample['plugins'] = load_annotations(annotated)

    # Group annotations by type
    annotations = sample['plugins']['annotations-plugin']['extracts']
    try:
        sel = Selector(text=add_tagids(sample['original_body']))
    except KeyError:
        # Fall back to the annotated markup as the original body
        sample['original_body'] = annotated
        sel = Selector(text=add_tagids(annotated))
    annotations = port_standard(annotations, sel, sample)
    standard_annos, generated_annos, variant_annos = [], [], []
    for a in annotations:
        if a.get('generated'):
            generated_annos.append(a)
        elif a.get('variants', 0) > 0:
            variant_annos.append(a)
        else:
            standard_annos.append(a)
    if not annotations:
        return sample

    # Fold all annotated elements into their common parent; the container
    # is created on that element's parent when it has one.
    common = find_element(annotations[0], sel)
    for other in annotations[1:]:
        common = find_common_parent(common, find_element(other, sel))
    container_id = gen_id()
    parent = common.getparent()
    new_annotations = [
        _create_container(common if parent is None else parent, container_id,
                          selector=sel)
    ]
    for a in standard_annos:
        a.pop('variant', None)
    new_annotations.extend(standard_annos)
    new_annotations.extend(port_generated(generated_annos, sel))
    new_annotations.extend(port_variants(variant_annos, sel))
    for a in new_annotations:
        # Item containers that already name a container keep it
        if not (a.get('item_container') and a.get('container_id')):
            a['container_id'] = container_id
        # Drop the old tag id; 'data-tagid' is only removed when 'tagid'
        # was missing or falsy
        if not a.pop('tagid', None):
            a.pop('data-tagid', None)
    # Update annotations
    sample['plugins']['annotations-plugin']['extracts'] = new_annotations
    sample['version'] = SLYBOT_VERSION
    return sample
Example #2
0
def port_sample(sample):
    """Convert slybot samples made before slybot 0.13 to new format.

    The sample dict is updated in place and the same object is returned.

    * With no (or an empty) ``annotated_body`` the sample is returned as is,
      except that an empty ``annotations-plugin`` entry is added when the
      sample has no ``plugins`` yet.
    * Otherwise ``annotated_body`` is removed from the sample, the
      annotations are ported and every annotation is linked to a newly
      created container annotation.  ``version`` is set to
      ``SLYBOT_VERSION`` only when there were annotations to port.
    """
    if not sample.get("annotated_body"):
        if not sample.get("plugins"):
            sample["plugins"] = {"annotations-plugin": {"extracts": []}}
        return sample  # Handle empty body
    # Keep the annotated markup in a local: the key is dropped from the
    # sample, but the markup is still needed below as a fallback when the
    # sample has no usable original body.
    annotated = sample.pop("annotated_body")
    if not sample.get("plugins"):
        sample["plugins"] = load_annotations(annotated)

    # Group annotations by type
    annotations = sample["plugins"]["annotations-plugin"]["extracts"]
    try:
        sel = Selector(text=add_tagids(sample["original_body"]))
    except KeyError:
        # Fall back to the annotated markup as the original body
        sample["original_body"] = annotated
        sel = Selector(text=add_tagids(annotated))
    annotations = port_standard(annotations, sel, sample)
    standard_annos, generated_annos, variant_annos = [], [], []
    for a in annotations:
        if a.get("generated"):
            generated_annos.append(a)
        elif a.get("variants", 0) > 0:
            variant_annos.append(a)
        else:
            standard_annos.append(a)
    if not annotations:
        return sample

    # Fold all annotated elements into their common parent; the container
    # is created on that element's parent when it has one.
    common = find_element(annotations[0], sel)
    for other in annotations[1:]:
        common = find_common_parent(common, find_element(other, sel))
    container_id = gen_id()
    parent = common.getparent()
    container_root = common if parent is None else parent
    new_annotations = [_create_container(container_root, container_id, selector=sel)]
    for a in standard_annos:
        a.pop("variant", None)
    new_annotations.extend(standard_annos)
    new_annotations.extend(port_generated(generated_annos, sel))
    new_annotations.extend(port_variants(variant_annos, sel))
    for a in new_annotations:
        # Item containers that already name a container keep it
        if not (a.get("item_container") and a.get("container_id")):
            a["container_id"] = container_id
        # Drop the old tag id; "data-tagid" is only removed when "tagid"
        # was missing or falsy
        if not a.pop("tagid", None):
            a.pop("data-tagid", None)
    # Update annotations
    sample["plugins"]["annotations-plugin"]["extracts"] = new_annotations
    sample["version"] = SLYBOT_VERSION
    return sample
Example #3
0
def port_sample(sample):
    """Upgrade a sample saved by slybot older than 0.13 to the new format.

    The sample dict is changed in place and returned.  A sample without a
    (non-empty) ``annotated_body`` is returned untouched.
    """
    if not sample.get('annotated_body'):
        # Nothing to port without an annotated body
        return sample
    if not sample.get('plugins'):
        sample['plugins'] = load_annotations(sample.get('annotated_body', u''))
    del sample['annotated_body']

    # Port the stored annotations using a selector over the original body
    extracts = sample['plugins']['annotations-plugin']['extracts']
    tagged_body = add_tagids(sample['original_body'])
    sel = Selector(text=tagged_body)
    extracts = port_standard(extracts, sel, sample)

    # Sort every annotation into exactly one of the three groups
    plain, generated, variants = [], [], []
    for annotation in extracts:
        if annotation.get('generated'):
            bucket = generated
        elif annotation.get('variants', 0) > 0:
            bucket = variants
        else:
            bucket = plain
        bucket.append(annotation)

    for annotation in plain:
        annotation.pop('variant', None)
    ported = list(plain)
    ported += port_generated(generated, sel)
    ported += port_variants(variants, sel)

    # Store the ported annotations and stamp the sample with the version
    sample['plugins']['annotations-plugin']['extracts'] = ported
    sample['version'] = SLYBOT_VERSION
    return sample
Example #4
0
def load_annotations(body):
    """Build slybot annotation data from annotated html.

    Returns a dict shaped like
    ``{'annotations-plugin': {'extracts': [...]}}``; the list is empty when
    ``body`` is falsy.
    """
    extracts = []
    if not body:
        return {'annotations-plugin': {'extracts': extracts}}
    sel = Selector(text=add_tagids(body))
    used_ids = set()

    # Elements carrying a url-quoted JSON annotation
    for node in sel.xpath('//*[@data-scrapy-annotate]'):
        root = node._root
        attrs = root.attrib
        annotation = json.loads(unquote(attrs['data-scrapy-annotate']))
        is_ins = isinstance(root, _Element) and root.tag.lower() == 'ins'
        if is_ins:
            annotation.update(find_generated_annotation(node))
        else:
            annotation['tagid'] = attrs.get('data-tagid')
        if 'id' not in annotation:
            annotation['id'] = gen_id(disallow=used_ids)
        used_ids.add(annotation['id'])
        extracts.append(annotation)

    # Elements flagged with one of the ignore attributes
    ignore_xpath = '//*[@%s]' % '|@'.join(IGNORE_ATTRIBUTES)
    prefix_len = len('data-scrapy-')
    for node in sel.xpath(ignore_xpath):
        attrs = node._root.attrib
        # Stop at the first ignore attribute present on the element
        for name in IGNORE_ATTRIBUTES:
            if name in attrs:
                break
        ignore = {name[prefix_len:]: True}
        if 'id' not in ignore:
            ignore['id'] = gen_id(disallow=used_ids)
        used_ids.add(ignore['id'])
        extracts.append(ignore)
    return {'annotations-plugin': {'extracts': extracts}}
Example #5
0
def load_annotations(body):
    """Turn annotated html into slybot annotation data.

    The result always has the shape
    ``{'annotations-plugin': {'extracts': [...]}}``.  A falsy ``body``
    gives an empty list of extracts.
    """
    if not body:
        return {'annotations-plugin': {'extracts': []}}

    sel = Selector(text=add_tagids(body))
    seen_ids = set()
    collected = []

    # First pass: elements with a url-quoted JSON annotation attribute
    for selected in sel.xpath('//*[@data-scrapy-annotate]'):
        node = selected.root
        node_attrs = node.attrib
        raw = unquote(node_attrs['data-scrapy-annotate'])
        annotation = json.loads(raw)
        if isinstance(node, _Element) and node.tag.lower() == 'ins':
            annotation.update(find_generated_annotation(selected))
        else:
            annotation['tagid'] = node_attrs.get('data-tagid')
        if 'id' not in annotation:
            annotation['id'] = gen_id(disallow=seen_ids)
        seen_ids.add(annotation['id'])
        collected.append(annotation)

    # Second pass: elements carrying any of the ignore attributes
    for selected in sel.xpath('//*[@%s]' % '|@'.join(IGNORE_ATTRIBUTES)):
        node_attrs = selected.root.attrib
        # Leave the loop on the first ignore attribute the element has
        for attr_name in IGNORE_ATTRIBUTES:
            if attr_name in node_attrs:
                break
        short_name = attr_name[len('data-scrapy-'):]
        ignore = {short_name: True}
        if 'id' not in ignore:
            ignore['id'] = gen_id(disallow=seen_ids)
        seen_ids.add(ignore['id'])
        collected.append(ignore)

    return {'annotations-plugin': {'extracts': collected}}
Example #6
0
def port_sample(sample):
    """Migrate a sample created before slybot 0.13 to the new format.

    Works in place and returns the same sample dict.  Samples whose
    ``annotated_body`` is missing or empty are handed back unchanged.
    """
    if not sample.get('annotated_body'):
        return sample  # empty body, nothing to migrate
    if not sample.get('plugins'):
        sample['plugins'] = load_annotations(sample.get('annotated_body', u''))
    del sample['annotated_body']

    current = sample['plugins']['annotations-plugin']['extracts']
    sel = Selector(text=add_tagids(sample['original_body']))
    current = port_standard(current, sel, sample)

    # Classify each annotation as generated, variant or standard
    groups = {'standard': [], 'generated': [], 'variant': []}
    for anno in current:
        if anno.get('generated'):
            kind = 'generated'
        elif anno.get('variants', 0) > 0:
            kind = 'variant'
        else:
            kind = 'standard'
        groups[kind].append(anno)

    # Standard annotations are kept as they are, minus the 'variant' key
    for anno in groups['standard']:
        anno.pop('variant', None)
    migrated = groups['standard'][:]
    migrated.extend(port_generated(groups['generated'], sel))
    migrated.extend(port_variants(groups['variant'], sel))

    # Write the result back and record the version
    sample['plugins']['annotations-plugin']['extracts'] = migrated
    sample['version'] = SLYBOT_VERSION
    return sample
Example #7
0
def load_annotations(body):
    """Extract slybot annotations from annotated html.

    Returns ``{"annotations-plugin": {"extracts": [...]}}``, with an empty
    list when ``body`` is falsy.
    """
    found = []
    result = {"annotations-plugin": {"extracts": found}}
    if not body:
        return result

    sel = Selector(text=add_tagids(body))
    taken_ids = set()

    # Annotated elements: the attribute holds url-quoted JSON
    for match in sel.xpath("//*[@data-scrapy-annotate]"):
        element = match.root
        attrib = element.attrib
        annotation = json.loads(unquote(attrib["data-scrapy-annotate"]))
        generated = isinstance(element, _Element) and element.tag.lower() == "ins"
        if generated:
            annotation.update(find_generated_annotation(match))
        else:
            annotation["tagid"] = attrib.get("data-tagid")
        if "id" not in annotation:
            annotation["id"] = gen_id(disallow=taken_ids)
        taken_ids.add(annotation["id"])
        found.append(annotation)

    # Ignored elements: one entry per element having an ignore attribute
    query = "//*[@%s]" % "|@".join(IGNORE_ATTRIBUTES)
    for match in sel.xpath(query):
        attrib = match.root.attrib
        # The first ignore attribute found on the element names the entry
        for candidate in IGNORE_ATTRIBUTES:
            if candidate in attrib:
                break
        ignore = {candidate[len("data-scrapy-") :]: True}
        if "id" not in ignore:
            ignore["id"] = gen_id(disallow=taken_ids)
        taken_ids.add(ignore["id"])
        found.append(ignore)
    return result
Example #8
0
def port_sample(sample):
    """Bring a sample made before slybot 0.13 up to the new format.

    The sample dict is modified in place and returned.  If it has no
    (non-empty) ``annotated_body`` it is returned without any change.
    """
    if not sample.get("annotated_body"):
        # Empty body: there is nothing to convert
        return sample
    if not sample.get("plugins"):
        sample["plugins"] = load_annotations(sample.get("annotated_body", u""))
    del sample["annotated_body"]

    stored = sample["plugins"]["annotations-plugin"]["extracts"]
    body_with_ids = add_tagids(sample["original_body"])
    sel = Selector(text=body_with_ids)
    ported = port_standard(stored, sel, sample)

    # Separate the annotations by type
    standard, generated, variant = [], [], []
    for entry in ported:
        if entry.get("generated"):
            target = generated
        elif entry.get("variants", 0) > 0:
            target = variant
        else:
            target = standard
        target.append(entry)

    for entry in standard:
        entry.pop("variant", None)
    converted = []
    converted.extend(standard)
    converted.extend(port_generated(generated, sel))
    converted.extend(port_variants(variant, sel))

    # Replace the stored annotations and mark the sample version
    plugin_data = sample["plugins"]["annotations-plugin"]
    plugin_data["extracts"] = converted
    sample["version"] = SLYBOT_VERSION
    return sample
Example #9
0
def port_sample(sample, schemas=None):
    """Convert slybot samples made before slybot 0.13 to new format.

    The sample dict is updated in place and the same object is returned.

    :param sample: sample dict to convert.
    :param schemas: passed on to ``guess_schema``; an empty dict is used
        when omitted.

    When the sample has neither an annotated body nor plugins, or when no
    annotations remain after porting, the sample only receives a default
    ``body`` container and is returned without a ``version`` or
    ``schema_id`` being set.
    """
    if schemas is None:
        schemas = {}
    container_id = gen_id()
    default_annotations = [_create_container('body', container_id)]
    if not sample.get('annotated_body') and not sample.get('plugins'):
        sample['plugins'] = {
            'annotations-plugin': {
                'extracts': default_annotations
            }
        }
        return sample
    if not sample.get('plugins'):
        sample['plugins'] = load_annotations(sample.get('annotated_body', u''))
    # Keep the annotated markup in a local: the key is dropped from the
    # sample, but the markup is the fallback when there is no original body.
    annotated = sample.pop('annotated_body', None) or u''

    # Group annotations by type
    annotations = sample['plugins']['annotations-plugin']['extracts']
    try:
        sel = Selector(text=add_tagids(sample['original_body']))
    except KeyError:
        sample['original_body'] = annotated
        try:
            tagged = add_tagids(annotated)
        except KeyError:
            tagged = u''
        sel = Selector(text=tagged)
    annotations = port_standard(annotations, sel, sample)
    standard_annos, generated_annos, variant_annos = [], [], []
    for a in annotations:
        if a.get('generated'):
            generated_annos.append(a)
        elif a.get('variants', 0) > 0:
            variant_annos.append(a)
        else:
            standard_annos.append(a)
    if not annotations:
        sample['plugins'] = {
            'annotations-plugin': {
                'extracts': default_annotations
            }
        }
        return sample

    # Fold all annotated elements into their common parent; the container
    # is created on that element's parent when it has one.
    common = find_element(annotations[0], sel)
    for other in annotations[1:]:
        common = find_common_parent(common, find_element(other, sel))
    parent = common.getparent()
    container = _create_container(common if parent is None else parent,
                                  container_id,
                                  selector=sel)
    new_annotations = [container]
    for a in standard_annos:
        a.pop('variant', None)
    new_annotations.extend(standard_annos)
    new_annotations.extend(port_generated(generated_annos, sel))
    new_annotations.extend(port_variants(variant_annos, sel))
    for a in new_annotations:
        # Item containers that already name a container keep it
        if not (a.get('item_container') and a.get('container_id')):
            a['container_id'] = container_id
        # Drop the old tag id; 'data-tagid' is only removed when 'tagid'
        # was missing or falsy
        if not a.pop('tagid', None):
            a.pop('data-tagid', None)
    # Update annotations
    sample['plugins']['annotations-plugin']['extracts'] = new_annotations
    sample['version'] = SLYBOT_VERSION
    # The schema is guessed from the fully updated sample
    schema_id = guess_schema(sample, schemas)
    container['schema_id'] = schema_id
    return sample