Python parse_html Examples, scrapely.htmlpage.parse_html Python Examples

Example #1

0

Show file

File: test_htmlpage.py Project: samucc/scrapely

 def test_special_cases(self):
     """Attribute parsing for a few unusual tags, checked case by case."""
     cases = [
         (
             "<meta http-equiv='Pragma' content='no-cache' />",
             {'content': 'no-cache', 'http-equiv': 'Pragma'},
         ),
         (
             "<html xmlns='http://www.w3.org/1999/xhtml' xml:lang='en' lang='en'>",
             {'xmlns': 'http://www.w3.org/1999/xhtml', 'xml:lang': 'en', 'lang': 'en'},
         ),
         (
             # Upper-case attribute names are expected back lower-cased and
             # the stray "/" is expected as a valueless attribute.
             "<IMG SRC='http://images.play.com/banners/SAM550a.jpg' align='left' / hspace=5>",
             {
                 'src': 'http://images.play.com/banners/SAM550a.jpg',
                 'align': 'left',
                 'hspace': '5',
                 '/': None,
             },
         ),
     ]
     for markup, expected_attributes in cases:
         first_element = list(parse_html(markup))[0]
         self.assertEqual(first_element.attributes, expected_attributes)

Example #2

0

Show file

File: test_htmlpage.py Project: 1060460048/scrapely

 def test_special_cases(self):
     """Exercise attribute extraction on three awkward tags."""
     def attributes_of_first(markup):
         # Parse the whole snippet, then look only at the leading element.
         elements = list(parse_html(markup))
         return elements[0].attributes

     meta_attributes = attributes_of_first(
         "<meta http-equiv='Pragma' content='no-cache' />")
     self.assertEqual(
         meta_attributes, {'content': 'no-cache', 'http-equiv': 'Pragma'})

     html_attributes = attributes_of_first(
         "<html xmlns='http://www.w3.org/1999/xhtml' xml:lang='en' lang='en'>")
     self.assertEqual(
         html_attributes,
         {'xmlns': 'http://www.w3.org/1999/xhtml', 'xml:lang': 'en', 'lang': 'en'})

     img_attributes = attributes_of_first(
         "<IMG SRC='http://images.play.com/banners/SAM550a.jpg' align='left' / hspace=5>")
     self.assertEqual(
         img_attributes,
         {'src': 'http://images.play.com/banners/SAM550a.jpg',
          'align': 'left', 'hspace': '5', '/': None})

Example #3

0

Show file

File: test_htmlpage.py Project: flyeven/scrapely

 def test_special_cases(self):
     """Unusual tags must still produce the expected attribute dicts."""
     # <meta> with a hyphenated attribute name and a self-closing slash
     meta_markup = "<meta http-equiv='Pragma' content='no-cache' />"
     meta_tag = list(parse_html(meta_markup))[0]
     expected_meta = {"content": "no-cache", "http-equiv": "Pragma"}
     self.assertEqual(meta_tag.attributes, expected_meta)

     # <html> with namespaced attribute names
     html_markup = "<html xmlns='http://www.w3.org/1999/xhtml' xml:lang='en' lang='en'>"
     html_tag = list(parse_html(html_markup))[0]
     expected_html = {
         "xmlns": "http://www.w3.org/1999/xhtml",
         "xml:lang": "en",
         "lang": "en",
     }
     self.assertEqual(html_tag.attributes, expected_html)

     # <IMG> with upper-case names, an unquoted value and a misplaced "/"
     img_markup = "<IMG SRC='http://images.play.com/banners/SAM550a.jpg' align='left' / hspace=5>"
     img_tag = list(parse_html(img_markup))[0]
     expected_img = {
         "src": "http://images.play.com/banners/SAM550a.jpg",
         "align": "left",
         "hspace": "5",
         "/": None,
     }
     self.assertEqual(img_tag.attributes, expected_img)

Example #4

0

Show file

File: test_htmlpage.py Project: flyeven/scrapely

    def _test_sample(self, source, expected_parsed, samplecount=None):
        """Parse ``source`` and compare it element by element with ``expected_parsed``.

        ``expected_parsed`` is consumed from the front while comparing, so it
        is empty after a fully successful run.  For every parsed element the
        start/end offsets and the exact type must match the expected element;
        ``HtmlTag`` elements are additionally compared on ``tag``,
        ``attributes`` and ``tag_type``, and ``HtmlDataFragment`` elements on
        ``is_text_content``.

        ``samplecount``, when not ``None``, is appended to failure messages.

        Mismatches are reported through ``self.fail`` instead of
        ``assert False`` so they are not stripped under ``python -O``.  A
        parse that yields more elements than expected is reported as a test
        failure rather than an ``IndexError`` from popping an empty list.
        """
        def fail(message):
            # Add the sample number only when the caller supplied one.
            if samplecount is not None:
                message += " (sample %d)" % samplecount
            self.fail(message)

        count_element = 0
        count_expected = 0
        for element in parse_html(source):
            if type(element) is HtmlTag:
                count_element += 1
            element_text = source[element.start : element.end]
            if not expected_parsed:
                # The parser produced something the expectation list lacks.
                fail("Unexpected extra element [%s,%s] %s" % (
                    element.start, element.end, element_text))
            expected = expected_parsed.pop(0)
            if type(expected) is HtmlTag:
                count_expected += 1
            expected_text = source[expected.start : expected.end]
            if element.start != expected.start or element.end != expected.end:
                fail("[%s,%s] %s != [%s,%s] %s" % (
                    element.start,
                    element.end,
                    element_text,
                    expected.start,
                    expected.end,
                    expected_text,
                ))
            # Exact type comparison is intentional: a subclass is a mismatch.
            if type(element) != type(expected):
                fail("(%s) %s != (%s) %s for text\n%s" % (
                    count_element,
                    repr(type(element)),
                    count_expected,
                    repr(type(expected)),
                    element_text,
                ))
            if type(element) is HtmlTag:
                self.assertEqual(element.tag, expected.tag)
                self.assertEqual(element.attributes, expected.attributes)
                self.assertEqual(element.tag_type, expected.tag_type)
            if type(element) is HtmlDataFragment:
                msg = None
                if samplecount is not None:
                    msg = "Got: %s Expected: %s in sample: %d [%d:%d] (%s)" % (
                        element.is_text_content,
                        expected.is_text_content,
                        samplecount,
                        element.start,
                        element.end,
                        repr(element_text),
                    )
                self.assertEqual(element.is_text_content, expected.is_text_content, msg)

        # Anything left over was expected but never produced by the parser.
        if expected_parsed:
            fail("Expected %s" % repr(expected_parsed))

Example #5

0

Show file

File: html.py Project: zirconer/portia

def descriptify(doc, base=None, proxy=None):
    """Return ``doc`` with JavaScript neutralised.

    Tags named in BLOCKED_TAGNAMES are reduced to bare open/close tags and
    any non-blank content between them is replaced by a marker comment.
    ``base`` tags lose all attributes.  On every other tag, ``on*`` and
    ``http-equiv`` attributes are emptied, frame sources are pointed at a
    static placeholder, and URI attributes are rewritten (script URIs become
    "#", the rest are wrapped or joined against ``base`` when it is given).
    """
    pieces = []
    inside_blocked = False
    for node in parse_html(doc):
        # Non-tag nodes: copy through, unless we are inside a blocked tag.
        if not isinstance(node, HtmlTag):
            raw = doc[node.start:node.end]
            if inside_blocked and raw.strip():
                raw = '<!-- Removed by portia -->'
            pieces.append(raw)
            continue

        if node.tag in BLOCKED_TAGNAMES:
            # Assumes there are no void elements in BLOCKED_TAGNAMES
            # http://www.w3.org/TR/html5/syntax.html#void-elements
            opening_types = (HtmlTagType.OPEN_TAG, HtmlTagType.UNPAIRED_TAG)
            if not inside_blocked and node.tag_type in opening_types:
                pieces.append('<%s>' % node.tag)
                inside_blocked = True
            elif node.tag_type == HtmlTagType.CLOSE_TAG:
                pieces.append('</%s>' % node.tag)
                inside_blocked = False
            continue

        if node.tag == 'base':
            node.attributes = {}
            pieces.append(serialize_tag(node))
            continue

        # Iterate over a snapshot: the loop may add "_portia_*" attributes.
        for name, value in list(node.attributes.items()):
            if name.startswith('on') or name == "http-equiv":
                # Empty intrinsic events
                node.attributes[name] = ""
            elif base and proxy and name == "style" and value is not None:
                node.attributes[name] = process_css(value, -1, base)
            elif node.tag in ('frame', 'iframe') and name == 'src':
                node.attributes[name] = '/static/frames-not-supported.html'
            elif name in URI_ATTRIBUTES and value is not None:
                # Rewrite javascript URIs
                if _contains_js(unescape(value)):
                    node.attributes[name] = "#"
                elif base and proxy and (node.tag != "a" or name != 'href'):
                    node.attributes[name] = wrap_url(value, -1, base)
                    node.attributes['_portia_%s' % name] = value
                elif base:
                    node.attributes[name] = urljoin(base, value)
        pieces.append(serialize_tag(node))

    return ''.join(pieces)

Example #6

0

Show file

File: html.py Project: Kumangus/portia

def descriptify(doc, base=None, proxy=None):
    """Strip active JavaScript from an html source string.

    Walks the parsed document once and rebuilds it: blocked tags keep only
    their bare open/close form and their non-blank content is replaced by a
    marker comment, ``base`` tags are emptied of attributes, and the
    attributes of all other tags are sanitised (event handlers and
    ``http-equiv`` blanked, frame sources replaced, URI attributes rewritten).
    """
    rebuilt = []
    blocked_open = False
    for item in parse_html(doc):
        if isinstance(item, HtmlTag):
            if item.tag in BLOCKED_TAGNAMES:
                # Assumes there are no void elements in BLOCKED_TAGNAMES
                # http://www.w3.org/TR/html5/syntax.html#void-elements
                if not blocked_open and item.tag_type in (
                        HtmlTagType.OPEN_TAG, HtmlTagType.UNPAIRED_TAG):
                    rebuilt.append('<%s>' % item.tag)
                    blocked_open = True
                elif item.tag_type == HtmlTagType.CLOSE_TAG:
                    rebuilt.append('</%s>' % item.tag)
                    blocked_open = False
            elif item.tag == 'base':
                item.attributes = {}
                rebuilt.append(serialize_tag(item))
            else:
                # Snapshot the items so "_portia_*" keys can be added safely.
                for attr, value in list(item.attributes.items()):
                    if attr.startswith('on') or attr == "http-equiv":
                        # Empty intrinsic events
                        item.attributes[attr] = ""
                        continue
                    if base and proxy and attr == "style" and value is not None:
                        item.attributes[attr] = process_css(value, -1, base)
                        continue
                    if item.tag in ('frame', 'iframe') and attr == 'src':
                        item.attributes[attr] = '/static/frames-not-supported.html'
                        continue
                    if attr not in URI_ATTRIBUTES or value is None:
                        continue
                    # Rewrite javascript URIs
                    if _contains_js(unescape(value)):
                        item.attributes[attr] = "#"
                    elif base and proxy and not (item.tag == "a" and attr == 'href'):
                        item.attributes[attr] = wrap_url(value, -1, base)
                        item.attributes['_portia_%s' % attr] = value
                    elif base:
                        item.attributes[attr] = urljoin(base, value)
                rebuilt.append(serialize_tag(item))
        else:
            fragment = doc[item.start:item.end]
            hidden = blocked_open and fragment.strip()
            rebuilt.append('<!-- Removed by portia -->' if hidden else fragment)

    return ''.join(rebuilt)

Example #7

0

Show file

File: baseurl.py Project: zsmj513/portia

def insert_base_url(html, base):
    """
    Make sure ``html`` carries a usable <base> tag and return the result.

    If a base tag with an href is present and that href is not absolute, the
    tag is replaced by one whose href is joined against ``base``; an absolute
    href leaves the document untouched.  Otherwise a new base tag for
    ``base`` is inserted after the opening <head>, or wrapped in a new
    <head> after the opening <html>, or after the doctype, or at position 0.
    """
    base_href = base_tag = head_tag = html_tag = None
    for node in parse_html(html):
        # Nodes without a ``tag`` attribute yield None and match nothing.
        name = getattr(node, "tag", None)
        if name == "base":
            base_href = node.attributes.get("href", None)
            base_tag = node
        elif name == "head" and node.tag_type == HtmlTagType.OPEN_TAG:
            head_tag = node
        elif name == "html" and node.tag_type == HtmlTagType.OPEN_TAG:
            html_tag = node

    if base_href:
        if _is_abs_url(base_href):
            return html
        # replace original base tag
        replacement = '<base href="%s" />' % urlparse.urljoin(base, base_href)
        return html[:base_tag.start] + replacement + html[base_tag.end:]

    # Generate new base element and include
    new_tag = '<base href="%s" />' % base
    if head_tag:
        position = head_tag.end
    elif html_tag:
        new_tag = "\n<head>%s</head>\n" % new_tag
        position = html_tag.end
    else:
        doctype = DOCTYPERE.search(html)
        position = doctype.end() if doctype else 0
    return html[:position] + new_tag + html[position:]

Example #8

0

Show file

    def _test_sample(self, source, expected_parsed, samplecount=None):
        """Check that parsing ``source`` yields exactly ``expected_parsed``.

        Elements are compared in order; ``expected_parsed`` is consumed from
        the front as the comparison proceeds.  Each parsed element must have
        the same start/end offsets and the same exact type as its expected
        counterpart.  ``HtmlTag`` elements are then compared on ``tag``,
        ``attributes`` and ``tag_type``; ``HtmlDataFragment`` elements on
        ``is_text_content``.

        ``samplecount``, when not ``None``, is included in failure messages.

        Failures use ``self.fail`` rather than ``assert False`` so that they
        survive ``python -O``, and surplus parsed elements are reported as a
        failure instead of raising ``IndexError`` from an empty list.
        """
        def report(message):
            # Tag the message with the sample number when one was supplied.
            if samplecount is not None:
                message += " (sample %d)" % samplecount
            self.fail(message)

        count_element = 0
        count_expected = 0
        for element in parse_html(source):
            if type(element) is HtmlTag:
                count_element += 1
            element_text = source[element.start:element.end]
            if not expected_parsed:
                # More elements were parsed than the expectation list holds.
                report("Unexpected extra element [%s,%s] %s" % (
                    element.start, element.end, element_text))
            expected = expected_parsed.pop(0)
            if type(expected) is HtmlTag:
                count_expected += 1
            expected_text = source[expected.start:expected.end]
            if element.start != expected.start or element.end != expected.end:
                report("[%s,%s] %s != [%s,%s] %s" % (
                    element.start, element.end, element_text,
                    expected.start, expected.end, expected_text))
            # Exact type comparison is intentional: a subclass is a mismatch.
            if type(element) != type(expected):
                report("(%s) %s != (%s) %s for text\n%s" % (
                    count_element, repr(type(element)),
                    count_expected, repr(type(expected)), element_text))
            if type(element) is HtmlTag:
                self.assertEqual(element.tag, expected.tag)
                self.assertEqual(element.attributes, expected.attributes)
                self.assertEqual(element.tag_type, expected.tag_type)
            if type(element) is HtmlDataFragment:
                msg = None
                if samplecount is not None:
                    msg = "Got: %s Expected: %s in sample: %d [%d:%d] (%s)" % (
                        element.is_text_content, expected.is_text_content,
                        samplecount, element.start, element.end,
                        repr(element_text))
                self.assertEqual(element.is_text_content,
                                 expected.is_text_content, msg)

        # Leftovers were expected but never produced by the parser.
        if expected_parsed:
            report("Expected %s" % repr(expected_parsed))

Example #9

0

Show file

File: html.py Project: yyhTHU/portia

def descriptify(doc):
    """Neutralise JavaScript in an html source string.

    Script open/close tags, noscript tags, and non-blank content found
    inside a script element are wrapped between _AS_COMMENT_BEGIN and
    _AS_COMMENT_END.  On all other tags, intrinsic event attributes are
    emptied and URI attributes containing "javascript:" become "about:blank".
    """
    def as_comment(fragment):
        # Built lazily so the wrapper is only concatenated when it is used.
        return _AS_COMMENT_BEGIN + fragment + _AS_COMMENT_END

    pieces = []
    in_script = False
    for node in parse_html(doc):
        if not isinstance(node, HtmlTag):
            raw = doc[node.start:node.end]
            # Content that is already an html comment is left as it is.
            if in_script and raw.strip() and (
                    not raw.startswith("<!--") or not raw.endswith("-->")):
                pieces.append(as_comment(raw))
            else:
                pieces.append(raw)
            continue

        is_script = node.tag == "script"
        if is_script and not in_script and node.tag_type == HtmlTagType.OPEN_TAG:
            pieces.append(as_comment(doc[node.start:node.end]))
            in_script = True
        elif is_script and node.tag_type == HtmlTagType.CLOSE_TAG:
            in_script = False
            pieces.append(as_comment(doc[node.start:node.end]))
        elif node.tag == "noscript":
            pieces.append(as_comment(doc[node.start:node.end]))
        else:
            for name, value in list(node.attributes.items()):
                if name in INTRINSIC_EVENT_ATTRIBUTES:
                    # Empty intrinsic events
                    node.attributes[name] = ""
                elif (name in URI_ATTRIBUTES and value is not None
                        and "javascript:" in _deentitize_unicode(value)):
                    # Rewrite javascript URIs
                    node.attributes[name] = "about:blank"
            pieces.append(serialize_tag(node))

    return ''.join(pieces)

Example #10

0

Show file

File: html.py Project: 1060460048/portia

def descriptify(doc):
    """Disable JavaScript in an html source string.

    The document is re-assembled node by node.  Script open/close tags,
    noscript tags and non-blank script content are wrapped between
    _AS_COMMENT_BEGIN and _AS_COMMENT_END; other tags are re-serialized
    after their intrinsic event attributes are blanked and their
    "javascript:" URIs replaced with "about:blank".
    """
    rebuilt = []
    script_open = False
    for item in parse_html(doc):
        source_text = doc[item.start:item.end]
        if isinstance(item, HtmlTag):
            is_script = item.tag == "script"
            opens_script = (is_script and not script_open
                            and item.tag_type == HtmlTagType.OPEN_TAG)
            closes_script = is_script and item.tag_type == HtmlTagType.CLOSE_TAG
            if opens_script or closes_script or item.tag == "noscript":
                # Track whether we are inside a script element.
                if opens_script:
                    script_open = True
                elif closes_script:
                    script_open = False
                rebuilt.append(_AS_COMMENT_BEGIN + source_text + _AS_COMMENT_END)
            else:
                for attr, value in list(item.attributes.items()):
                    if attr in INTRINSIC_EVENT_ATTRIBUTES:
                        # Empty intrinsic events
                        item.attributes[attr] = ""
                    elif attr in URI_ATTRIBUTES and value is not None:
                        # Rewrite javascript URIs
                        if "javascript:" in _deentitize_unicode(value):
                            item.attributes[attr] = "about:blank"
                rebuilt.append(serialize_tag(item))
        else:
            is_html_comment = (source_text.startswith("<!--")
                               and source_text.endswith("-->"))
            if script_open and source_text.strip() and not is_html_comment:
                rebuilt.append(_AS_COMMENT_BEGIN + source_text + _AS_COMMENT_END)
            else:
                rebuilt.append(source_text)

    return ''.join(rebuilt)

Example #11

0

Show file

File: builder.py Project: daqv/portia-dashboard

    def apply(self):
        """Merge this builder's annotations into ``self.numbered_html``.

        The numbered html is parsed and walked once, in step with the
        annotations sorted by integer tag id.  Nodes before each annotation's
        tag are copied to the output verbatim; the annotated tag receives the
        attributes produced by ``self.generate`` and is re-serialized;
        "generated" annotations are handed to ``self._get_generated``.

        Returns the joined output after ``remove_tagids`` has been applied.
        """
        selector_annotations, tagid_annotations = self.split()
        inserts, numbered_html = defaultdict(list), self.numbered_html
        # Selector based annotations are converted and merged with the
        # tagid based ones.
        if selector_annotations:
            converted_annotations = self.apply_selector(selector_annotations)
            tagid_annotations += converted_annotations
        if not self.legacy:
            tagid_annotations = self.verify(
                [arg_to_iter(a) for a in tagid_annotations])
        target = iter(parse_html(numbered_html))
        output, stack = [], []
        # NOTE(review): an empty document would raise StopIteration here,
        # outside the try block below -- confirm callers never pass one.
        elem = next(target)
        last_id = 0
        # XXX: A dummy element is added to the end so if the last annotation is
        #      generated it will be added to the output
        # Group annotations by the tag id they are attached to.
        filtered = defaultdict(list)
        for grouped in tagid_annotations:
            for ann in arg_to_iter(grouped):
                filtered[ann['tagid']].append(ann)
        dummy = [(1e9, [{}])]
        # Annotations without a tag id (key None) are dropped here.
        sorted_annotations = sorted([(int(k), v) for k, v in filtered.items()
                                     if k is not None])
        try:
            for aid, annotation_data in chain(sorted_annotations, dummy):
                # Move target until replacement/insertion point
                while True:
                    # Non-tag nodes and <ins> tags are copied through as-is.
                    while not isinstance(elem, HtmlTag) or elem.tag == 'ins':
                        output.append(numbered_html[elem.start:elem.end])
                        elem = next(target)
                    # Opening (or unpaired) tags push their tag id.
                    if elem.tag_type in {OPEN_TAG, UNPAIRED_TAG}:
                        last_id = elem.attributes.get(TAGID)
                        stack.append(last_id)
                    if elem.tag_type in {CLOSE_TAG, UNPAIRED_TAG} and stack:
                        # '__added' marks an element whose text is already
                        # in the output so it is not appended twice.
                        if ('__added' not in elem.attributes and
                                last_id is not None and aid is not None and
                                int(last_id) < int(aid)):
                            output.append(numbered_html[elem.start:elem.end])
                            elem.attributes['__added'] = True
                        # Content queued in ``inserts`` for the tag being
                        # closed is emitted now.
                        last_inserted = stack.pop()
                        to_insert = inserts.pop(last_inserted, None)
                        if to_insert:
                            output.extend(to_insert)
                            # Skip all nodes up to the next HtmlTag as these
                            # have already been added
                            while True:
                                elem = next(target)
                                try:
                                    last_id = elem.attributes.get(TAGID,
                                                                  last_id)
                                except AttributeError:
                                    # Non-tag nodes have no attributes.
                                    pass
                                if isinstance(elem, HtmlTag):
                                    break
                            continue
                    # Still before the annotated tag: copy and advance.
                    if (last_id is not None and aid is not None and
                            int(last_id) < int(aid)):
                        if '__added' not in elem.attributes:
                            output.append(numbered_html[elem.start:elem.end])
                            elem.attributes['__added'] = True
                        elem = next(target)
                    else:
                        break

                generated = []
                next_generated = []
                regular_annotations = []
                # Place generated annotations at the end and sort by slice
                for annotation in sorted(annotation_data, key=_annotation_key):
                    if annotation.get('generated'):
                        if annotation.get('insert_after'):
                            next_generated.append(annotation)
                        else:
                            generated.append(annotation)
                    else:
                        regular_annotations.append(annotation)
                # Add annotations data as required
                if regular_annotations:
                    annotation_info = self.generate(regular_annotations)
                    for key, val in annotation_info.items():
                        elem.attributes[key] = val
                next_text_section = ''
                if generated:
                    # tee() lets the inner nodes be read ahead without
                    # consuming the main ``target`` iterator.
                    inner_data, target = tee(target)
                    nodes = _get_inner_nodes(inner_data)
                    next_text_section = self._get_generated(
                        elem, generated, nodes, inserts)
                if next_generated:
                    inner_data, target = tee(target)
                    # An unpaired tag has no open element to balance.
                    open_tags = 0 if elem.tag_type == UNPAIRED_TAG else 1
                    nodes = _get_inner_nodes(inner_data, open_tags=open_tags,
                                             insert_after=True)
                    next_text_section = self._get_generated(
                        elem, next_generated, nodes, inserts)

                if '__added' not in elem.attributes:
                    output.append(serialize_tag(elem))
                    elem.attributes['__added'] = True
                # If an <ins> tag has been inserted we need to move forward
                if next_text_section:
                    # Copy nodes until the next text fragment, which is
                    # replaced by the generated section.
                    while True:
                        elem = next(target)
                        if (isinstance(elem, HtmlDataFragment) and
                                elem.is_text_content):
                            break
                        output.append(numbered_html[elem.start:elem.end])
                    output.append(next_text_section)
        # Reached the end of the document
        except StopIteration:
            # NOTE(review): appends the last element seen without checking
            # '__added' -- confirm this cannot duplicate output.
            output.append(numbered_html[elem.start:elem.end])
        else:
            # Annotations ran out first: copy the rest of the document.
            for element in target:
                output.append(numbered_html[element.start:element.end])
        return remove_tagids(''.join(output))

Example #12

0

Show file

def apply_annotations(source_html, target_html):
    """
    Apply the annotations present in source_html to the raw target_html.

    source_html must be the tagged (annotated) source; target_html is the
    original raw source (no tags, no annotations).  The target is numbered
    with add_tagids and walked in step with the annotations extracted from
    source_html.  The result is returned as a string after remove_tagids
    has been applied.

    The parsed target is advanced with the built-in next() rather than the
    Python 2 only ``.next()`` method, so the walk works on Python 2.6+ and
    Python 3 iterators alike.
    """
    annotations = _extract_annotations(source_html)
    target_page = HtmlPage(body=target_html)
    cleansing = _get_cleansing(target_page, annotations)

    numbered_html = add_tagids(target_page)
    # iter() is a no-op on an iterator and makes next() valid for any iterable
    target = iter(parse_html(numbered_html))
    output = []

    element = next(target)
    eof = False
    # copy everything that precedes the first numbered tag
    while not (isinstance(element, HtmlTag) and TAGID in element.attributes):
        output.append(numbered_html[element.start:element.end])
        element = next(target)
    last_id = element.attributes[TAGID]
    for i, annotation in enumerate(annotations):
        # look up replacement/insertion point
        aid = _get_data_id(annotation)
        # move target until replacement/insertion point
        while int(last_id) < int(aid):
            output.append(numbered_html[element.start:element.end])
            element = next(target)
            while not (isinstance(element, HtmlTag) and
                       TAGID in element.attributes):
                output.append(numbered_html[element.start:element.end])
                element = next(target)
            last_id = element.attributes[TAGID]

        # replace/insert in target
        if isinstance(annotation, HtmlTag):
            # whole-tag annotation: copy its data-scrapy-* attributes
            for key, val in annotation.attributes.items():
                if key.startswith("data-scrapy-"):
                    element.attributes[key] = val
            output.append(serialize_tag(element))
            # stay on this element if the next annotation shares its id
            if not (i + 1 < len(annotations) and
                    _get_data_id(annotations[i + 1]) == aid):
                element = next(target)

        else:  # partial annotation
            # presumably the number of closing tags preceding the annotated
            # region -- verify against _get_closing_tags
            closing_tags = _get_closing_tags(annotation)
            if not (i > 0 and _get_data_id(annotations[i - 1]) == aid):
                output.append(numbered_html[element.start:element.end])
                while closing_tags > 0:
                    element = next(target)
                    output.append(numbered_html[element.start:element.end])
                    if isinstance(element, HtmlTag) and \
                            element.tag_type == HtmlTagType.CLOSE_TAG:
                        closing_tags -= 1

            elif (i > 0 and isinstance(annotations[i - 1], HtmlTag) and
                    annotation[0].start > annotations[i - 1].end):
                element = next(target)
                while closing_tags > 0:
                    output.append(numbered_html[element.start:element.end])
                    element = next(target)
                    if isinstance(element, HtmlTag) and \
                            element.tag_type == HtmlTagType.CLOSE_TAG:
                        closing_tags -= 1

                output.append(numbered_html[element.start:element.end])

            num_tags_inside = 0
            partial_output = ""

            # computes number of tags inside a partial annotation
            for p in annotation:
                partial_output += source_html[p.start:p.end]
                if isinstance(p, HtmlTag):
                    num_tags_inside += 1
                    if "insert-after" in p.attributes:
                        num_tags_inside -= 2

            if aid in cleansing:
                partial_output, fix_tag_count = _merge_code(
                    partial_output, cleansing[aid])
                num_tags_inside += fix_tag_count

            output.append(partial_output)

            element = next(target)  # consume reference tag

            # consume the tags inside partial annotation
            while num_tags_inside > 0:
                if isinstance(element, HtmlTag):
                    num_tags_inside -= 1
                element = next(target)

            if not isinstance(element, HtmlTag):
                element = next(target)

        if not (i + 1 < len(annotations) and
                _get_data_id(annotations[i + 1]) == aid):
            # advance to the next numbered tag, copying what lies between;
            # running out of nodes here means the document has ended
            try:
                while not (isinstance(element, HtmlTag) and
                           TAGID in element.attributes):
                    output.append(numbered_html[element.start:element.end])
                    element = next(target)
            except StopIteration:
                eof = True
            else:
                last_id = element.attributes[TAGID]

    # flush the pending element (unless the document ended) and the remainder
    if not eof:
        output.append(numbered_html[element.start:element.end])
    for element in target:
        output.append(numbered_html[element.start:element.end])

    return remove_tagids(''.join(output))

Example #13

0

Show file

 def test_ignore_xml_declaration(self):
     """An xml declaration embedded in html must not count as text content."""
     markup = u"<p>The text</p><?xml:namespace blabla/><p>is here</p>"
     elements = list(parse_html(markup))
     # Index 3 is presumably the <?xml:namespace ...> fragment.
     declaration = elements[3]
     self.assertFalse(declaration.is_text_content)

Example #14

0

Show file

File: test_htmlpage.py Project: 1060460048/scrapely

 def test_ignore_xml_declaration(self):
     """The fourth parsed element of the sample is not text content."""
     fourth = [
         node
         for node in parse_html(
             u"<p>The text</p><?xml:namespace blabla/><p>is here</p>")
     ][3]
     self.assertFalse(fourth.is_text_content)

Example #15

0

Show file

File: annotations.py Project: AlekseyEf/portia

def apply_annotations(annotations, target_page):
    """Write annotation data into the tags of ``target_page``.

    The page is first numbered with ``add_tagids`` and parsed with
    ``parse_html``.  Annotations are grouped by their ``'tagid'`` and handled
    in increasing numeric tagid order; the parsed stream is copied to the
    output until the tag with that id is reached.

    * Non-generated annotations have the key/value pairs returned by
      ``_gen_annotation_info`` set as attributes on the reached tag.
    * Annotations flagged ``'generated'`` are delegated to
      ``_get_generated_annotation`` (those flagged ``'insert_after'`` are
      handled in a second pass over the nodes following the tag).

    :param annotations: iterable of annotation dicts.  Falsy entries and
        entries without a ``'tagid'`` or without either ``'annotations'`` or
        ``'ignore'`` are skipped.
    :param target_page: html source passed to ``add_tagids``.
    :returns: the result of ``remove_tagids`` applied to the joined output.
    :raises StopIteration: if the parsed page yields no element at all (the
        first ``next`` call is made outside the ``try`` block).
    """
    inserts = defaultdict(list)
    numbered_html = add_tagids(target_page)
    # iter() plus the next() builtin behave the same on Python 2.6+ and
    # Python 3, unlike the Python 2-only ``.next()`` iterator method.
    target = iter(parse_html(numbered_html))
    output, tag_stack = [], []

    element = next(target)
    last_id = 0
    # XXX: A dummy element is added to the end so if the last annotation is
    #      generated it will be added to the output
    filtered = defaultdict(list)
    for ann in annotations:
        if ann and ann.get('tagid') and (ann.get('annotations') or
                ann.get('ignore')):
            filtered[ann['tagid']].append(ann)
    dummy = [(1e9, [{}])]
    sorted_annotations = sorted([(int(k), v) for k, v in filtered.items()] +
                                dummy)
    try:
        for aid, annotation_data in sorted_annotations:
            # Move target until replacement/insertion point
            while True:
                # Non-tag nodes are copied through verbatim
                while not isinstance(element, HtmlTag):
                    output.append(numbered_html[element.start:element.end])
                    element = next(target)
                if element.tag_type in {OPEN_TAG, UNPAIRED_TAG}:
                    last_id = element.attributes.get(TAGID)
                    tag_stack.append(last_id)
                if element.tag_type in {CLOSE_TAG, UNPAIRED_TAG} and tag_stack:
                    # last_id is None when the last opened tag carried no
                    # TAGID attribute; int(None) would raise TypeError, so
                    # such tags are never compared against aid.
                    if ('__added' not in element.attributes and
                            last_id is not None and
                            int(last_id) < int(aid)):
                        output.append(numbered_html[element.start:element.end])
                        # '__added' marks a tag already written to output
                        element.attributes['__added'] = True
                    last_inserted = tag_stack.pop()
                    to_insert = inserts.pop(last_inserted, None)
                    if to_insert:
                        output.extend(to_insert)
                        # Skip all nodes up to the next HtmlTag as these
                        # have already been added
                        while True:
                            element = next(target)
                            try:
                                last_id = element.attributes.get(TAGID,
                                                                 last_id)
                            except AttributeError:
                                # node has no ``attributes``: keep last_id
                                pass
                            if isinstance(element, HtmlTag):
                                break
                        continue
                if last_id is not None and int(last_id) < int(aid):
                    if '__added' not in element.attributes:
                        output.append(numbered_html[element.start:element.end])
                        element.attributes['__added'] = True
                    element = next(target)
                else:
                    break

            generated = []
            next_generated = []
            # Place generated annotations at the end and sort by slice
            for annotation in sorted(annotation_data, key=_annotation_key):
                if annotation.get('generated'):
                    if annotation.get('insert_after'):
                        next_generated.append(annotation)
                    else:
                        generated.append(annotation)
                else:
                    # Add annotations data as required
                    annotation_info = _gen_annotation_info(annotation)
                    for key, val in annotation_info.items():
                        element.attributes[key] = val
            next_text_section = ''
            if generated:
                # tee() hands the helper its own copy of the stream so that
                # reading ahead does not consume nodes from ``target``
                inner_data, target = tee(target)
                nodes = _get_inner_nodes(inner_data)
                next_text_section = _get_generated_annotation(
                    element, generated, nodes, numbered_html, inserts)
            if next_generated:
                inner_data, target = tee(target)
                open_tags = 0 if element.tag_type == UNPAIRED_TAG else 1
                nodes = _get_inner_nodes(inner_data, open_tags=open_tags,
                                         insert_after=True)
                next_text_section = _get_generated_annotation(
                    element, next_generated, nodes, numbered_html, inserts)

            if '__added' not in element.attributes:
                output.append(serialize_tag(element))
                element.attributes['__added'] = True
            # If an <ins> tag has been inserted we need to move forward
            if next_text_section:
                while True:
                    elem = next(target)
                    if (isinstance(elem, HtmlDataFragment) and
                            elem.is_text_content):
                        break
                    output.append(numbered_html[elem.start:elem.end])
                output.append(next_text_section)
    # Reached the end of the document
    except StopIteration:
        output.append(numbered_html[element.start:element.end])
    else:
        # Annotations exhausted first: copy the rest of the page unchanged
        for element in target:
            output.append(numbered_html[element.start:element.end])
    return remove_tagids(''.join(output))

Example #16

0

Show file

def apply_annotations(annotations, target_page):
    """Write annotation data into the tags of ``target_page``.

    ``_filter_annotations`` splits ``annotations`` into selector based and
    tagid based ones.  When selector annotations are present they are passed
    to ``apply_selector_annotations`` together with the numbered html and the
    result is appended to the tagid annotations.

    The tagid annotations are grouped by ``'tagid'`` and handled in
    increasing numeric order while the parsed page is copied to the output:

    * non-generated annotations have the key/value pairs returned by
      ``_gen_annotation_info`` set as attributes on the reached tag;
    * annotations flagged ``'generated'`` are delegated to
      ``_get_generated_annotation`` (those flagged ``'insert_after'`` are
      handled in a second pass over the nodes following the tag).

    :param annotations: annotations accepted by ``_filter_annotations``.
    :param target_page: html source passed to ``add_tagids``.
    :returns: the result of ``remove_tagids`` applied to the joined output.
    :raises StopIteration: if the parsed page yields no element at all (the
        first ``next`` call is made outside the ``try`` block).
    """
    selector_annotations, tagid_annotations = _filter_annotations(annotations)
    inserts = defaultdict(list)
    numbered_html = add_tagids(target_page)
    if selector_annotations:
        converted_annotations = apply_selector_annotations(
            selector_annotations, numbered_html)
        tagid_annotations += converted_annotations
    target = iter(parse_html(numbered_html))
    output, tag_stack = [], []
    element = next(target)
    last_id = 0
    # XXX: A dummy element is added to the end so if the last annotation is
    #      generated it will be added to the output
    filtered = defaultdict(list)
    for ann in tagid_annotations:
        filtered[ann['tagid']].append(ann)
    dummy = [(1e9, [{}])]
    sorted_annotations = sorted([(int(k), v) for k, v in filtered.items()] +
                                dummy)
    try:
        for aid, annotation_data in sorted_annotations:
            # Move target until replacement/insertion point
            while True:
                # Non-tag nodes and <ins> tags are copied through verbatim
                while not isinstance(element, HtmlTag) or element.tag == 'ins':
                    output.append(numbered_html[element.start:element.end])
                    element = next(target)
                if element.tag_type in {OPEN_TAG, UNPAIRED_TAG}:
                    last_id = element.attributes.get(TAGID)
                    tag_stack.append(last_id)
                if element.tag_type in {CLOSE_TAG, UNPAIRED_TAG} and tag_stack:
                    # last_id is None when the tag carried no TAGID attribute;
                    # the None checks keep int() from raising TypeError
                    if ('__added' not in element.attributes and
                            last_id is not None and aid is not None and
                            int(last_id) < int(aid)):
                        output.append(numbered_html[element.start:element.end])
                        # '__added' marks a tag already written to output
                        element.attributes['__added'] = True
                    last_inserted = tag_stack.pop()
                    to_insert = inserts.pop(last_inserted, None)
                    if to_insert:
                        output.extend(to_insert)
                        # Skip all nodes up to the next HtmlTag as these
                        # have already been added
                        while True:
                            element = next(target)
                            try:
                                last_id = element.attributes.get(TAGID,
                                                                 last_id)
                            except AttributeError:
                                # node has no ``attributes``: keep last_id
                                pass
                            if isinstance(element, HtmlTag):
                                break
                        continue
                if (last_id is not None and aid is not None and
                        int(last_id) < int(aid)):
                    if '__added' not in element.attributes:
                        output.append(numbered_html[element.start:element.end])
                        element.attributes['__added'] = True
                    element = next(target)
                else:
                    break

            generated = []
            next_generated = []
            # Place generated annotations at the end and sort by slice
            for annotation in sorted(annotation_data, key=_annotation_key):
                if annotation.get('generated'):
                    if annotation.get('insert_after'):
                        next_generated.append(annotation)
                    else:
                        generated.append(annotation)
                else:
                    # Add annotations data as required
                    annotation_info = _gen_annotation_info(annotation)
                    for key, val in annotation_info.items():
                        element.attributes[key] = val
            next_text_section = ''
            if generated:
                # tee() hands the helper its own copy of the stream so that
                # reading ahead does not consume nodes from ``target``
                inner_data, target = tee(target)
                nodes = _get_inner_nodes(inner_data)
                next_text_section = _get_generated_annotation(
                    element, generated, nodes, numbered_html, inserts)
            if next_generated:
                inner_data, target = tee(target)
                open_tags = 0 if element.tag_type == UNPAIRED_TAG else 1
                nodes = _get_inner_nodes(inner_data, open_tags=open_tags,
                                         insert_after=True)
                next_text_section = _get_generated_annotation(
                    element, next_generated, nodes, numbered_html, inserts)

            if '__added' not in element.attributes:
                output.append(serialize_tag(element))
                element.attributes['__added'] = True
            # If an <ins> tag has been inserted we need to move forward
            if next_text_section:
                while True:
                    elem = next(target)
                    if (isinstance(elem, HtmlDataFragment) and
                            elem.is_text_content):
                        break
                    output.append(numbered_html[elem.start:elem.end])
                output.append(next_text_section)
    # Reached the end of the document
    except StopIteration:
        output.append(numbered_html[element.start:element.end])
    else:
        # Annotations exhausted first: copy the rest of the page unchanged
        for element in target:
            output.append(numbered_html[element.start:element.end])
    return remove_tagids(''.join(output))

Example #17

0

Show file

def _merge_code(code1, code2):
    """Merge two html snippets by aligning them on their text content.

    Both snippets are parsed; the text of one must be a prefix of the text
    of the other (checked with an ``assert``).  Tags from both snippets are
    interleaved at their text offsets.  Where both snippets place a tag at
    the same offset, both relative orders are generated as candidates and
    the first candidate accepted by ``_order_is_valid`` is chosen (the last
    candidate is used if none is accepted).

    Returns a tuple ``(merged_html, tag_delta)`` where ``tag_delta`` is the
    number of tags in the chosen candidate minus the number of tags in
    ``code1``.
    """
    def split_markup(code, parsed):
        """Return (tag offsets, tags, concatenated text) of a parsed snippet"""
        offsets, tags, pieces = [], [], []
        consumed = 0
        for node in parsed:
            if isinstance(node, HtmlTag):
                # a tag is anchored at the amount of text seen before it
                offsets.append(consumed)
                tags.append(node)
            else:
                consumed += node.end - node.start
                pieces.append(code[node.start:node.end])
        return offsets, tags, "".join(pieces)

    parsed1 = list(parse_html(code1))
    parsed2 = list(parse_html(code2))
    offsets1, tags1, text1 = split_markup(code1, parsed1)
    offsets2, tags2, text2 = split_markup(code2, parsed2)
    tag_count1 = len(tags1)

    assert text1.startswith(text2) or text2.startswith(text1)

    anchors1 = set(offsets1)
    anchors2 = set(offsets2)
    # one tag of each snippet is taken per distinct offset, in document order
    pending1 = iter(tags1)
    pending2 = iter(tags2)

    candidates = [""]
    previous = 0
    # walk the distinct offsets in order, branching whenever both snippets
    # have a tag at the same offset and the relative order is ambiguous
    for offset in sorted(anchors1 | anchors2):
        chunk = text1[previous:offset]
        candidates = [prefix + chunk for prefix in candidates]
        in_first = offset in anchors1
        in_second = offset in anchors2
        if in_first and in_second:
            tag1 = next(pending1)
            tag2 = next(pending2)
            markup1 = code1[tag1.start:tag1.end]
            markup2 = code2[tag2.start:tag2.end]
            first_then_second = [prefix + markup1 + markup2
                                 for prefix in candidates]
            second_then_first = [prefix + markup2 + markup1
                                 for prefix in candidates]
            candidates = first_then_second + second_then_first
        elif in_first:
            tag1 = next(pending1)
            markup1 = code1[tag1.start:tag1.end]
            candidates = [prefix + markup1 for prefix in candidates]
        else:
            tag2 = next(pending2)
            markup2 = code2[tag2.start:tag2.end]
            candidates = [prefix + markup2 for prefix in candidates]
        previous = offset

    # choose the first valid
    for candidate in candidates:
        parsed_candidate = list(parse_html(candidate))
        if _order_is_valid(parsed_candidate):
            break

    # append whatever text the longer snippet has beyond the shorter one
    if text1.startswith(text2):
        tail = text1[len(text2):]
    else:
        tail = text2[len(text1):]
    merged = candidate + tail

    final_tag_count = len([node for node in parsed_candidate
                           if isinstance(node, HtmlTag)])
    return merged, final_tag_count - tag_count1