def test_special_cases(self):
    """Check attribute parsing of the first element for a few unusual tags."""
    # Each entry pairs a markup snippet with the attribute mapping expected
    # on the first element produced by parse_html.
    cases = (
        (
            "<meta http-equiv='Pragma' content='no-cache' />",
            {'content': 'no-cache', 'http-equiv': 'Pragma'},
        ),
        (
            "<html xmlns='http://www.w3.org/1999/xhtml' xml:lang='en' lang='en'>",
            {'xmlns': 'http://www.w3.org/1999/xhtml', 'xml:lang': 'en', 'lang': 'en'},
        ),
        (
            "<IMG SRC='http://images.play.com/banners/SAM550a.jpg' align='left' / hspace=5>",
            {
                'src': 'http://images.play.com/banners/SAM550a.jpg',
                'align': 'left',
                'hspace': '5',
                '/': None,
            },
        ),
    )
    for markup, wanted_attributes in cases:
        elements = list(parse_html(markup))
        self.assertEqual(elements[0].attributes, wanted_attributes)
def test_special_cases(self):
    """Check attribute parsing of the first element for a few unusual tags."""

    def first_attributes(markup):
        # Consume the whole parse, then look only at the first element.
        return list(parse_html(markup))[0].attributes

    meta_attrs = first_attributes("<meta http-equiv='Pragma' content='no-cache' />")
    self.assertEqual(meta_attrs, {"content": "no-cache", "http-equiv": "Pragma"})

    html_attrs = first_attributes(
        "<html xmlns='http://www.w3.org/1999/xhtml' xml:lang='en' lang='en'>"
    )
    self.assertEqual(
        html_attrs,
        {"xmlns": "http://www.w3.org/1999/xhtml", "xml:lang": "en", "lang": "en"},
    )

    img_attrs = first_attributes(
        "<IMG SRC='http://images.play.com/banners/SAM550a.jpg' align='left' / hspace=5>"
    )
    self.assertEqual(
        img_attrs,
        {
            "src": "http://images.play.com/banners/SAM550a.jpg",
            "align": "left",
            "hspace": "5",
            "/": None,
        },
    )
def _test_sample(self, source, expected_parsed, samplecount=None):
    """Compare ``parse_html(source)`` with ``expected_parsed`` element by element.

    ``expected_parsed`` is consumed from the front (``pop(0)``) as elements
    are matched, so the caller's list is emptied on success.

    ``samplecount``, when given, is appended to failure messages so the
    failing sample can be identified.

    Failures are reported through ``self.fail`` / ``self.assertEqual`` rather
    than bare ``assert`` statements, which are stripped under ``python -O``.
    """

    def with_sample(message):
        # Tag the failure with the sample index when the caller provided one.
        if samplecount is not None:
            message += " (sample %d)" % samplecount
        return message

    count_element = 0
    count_expected = 0
    for element in parse_html(source):
        # Exact type comparisons are kept on purpose: isinstance() would also
        # match subclasses and could change which branch an element takes.
        if type(element) is HtmlTag:
            count_element += 1
        element_text = source[element.start : element.end]

        if not expected_parsed:
            # The parser produced more elements than the expectation lists;
            # fail with context instead of an IndexError from pop(0).
            self.fail(
                with_sample(
                    "Unexpected extra element [%s,%s] %s"
                    % (element.start, element.end, element_text)
                )
            )
        expected = expected_parsed.pop(0)
        if type(expected) is HtmlTag:
            count_expected += 1
        expected_text = source[expected.start : expected.end]

        if element.start != expected.start or element.end != expected.end:
            self.fail(
                with_sample(
                    "[%s,%s] %s != [%s,%s] %s"
                    % (
                        element.start,
                        element.end,
                        element_text,
                        expected.start,
                        expected.end,
                        expected_text,
                    )
                )
            )

        if type(element) is not type(expected):
            self.fail(
                with_sample(
                    "(%s) %s != (%s) %s for text\n%s"
                    % (
                        count_element,
                        repr(type(element)),
                        count_expected,
                        repr(type(expected)),
                        element_text,
                    )
                )
            )

        if type(element) is HtmlTag:
            self.assertEqual(element.tag, expected.tag)
            self.assertEqual(element.attributes, expected.attributes)
            self.assertEqual(element.tag_type, expected.tag_type)

        if type(element) is HtmlDataFragment:
            # The detailed message needs a sample index for its %d slot, so
            # it is only built when one was supplied.
            msg = (
                "Got: %s Expected: %s in sample: %d [%d:%d] (%s)"
                % (
                    element.is_text_content,
                    expected.is_text_content,
                    samplecount,
                    element.start,
                    element.end,
                    repr(element_text),
                )
                if samplecount is not None
                else None
            )
            self.assertEqual(element.is_text_content, expected.is_text_content, msg)

    if expected_parsed:
        # The parser produced fewer elements than expected.
        self.fail(with_sample("Expected %s" % repr(expected_parsed)))
def descriptify(doc, base=None, proxy=None):
    """Return ``doc`` with script-capable markup neutralised.

    Tags named in ``BLOCKED_TAGNAMES`` are reduced to a bare open/close tag
    and any non-blank content between them is replaced by a placeholder
    comment. ``<base>`` tags lose all attributes. On every other tag,
    ``on*`` and ``http-equiv`` attributes are emptied, frame sources are
    redirected, and URI attributes are rewritten depending on ``base`` and
    ``proxy``.
    """
    opening_types = (HtmlTagType.OPEN_TAG, HtmlTagType.UNPAIRED_TAG)
    pieces = []
    inside_blocked = False
    for node in parse_html(doc):
        if not isinstance(node, HtmlTag):
            raw = doc[node.start:node.end]
            if inside_blocked and raw.strip():
                raw = '<!-- Removed by portia -->'
            pieces.append(raw)
            continue

        if node.tag in BLOCKED_TAGNAMES:
            # Assumes there are no void elements in BLOCKED_TAGNAMES
            # http://www.w3.org/TR/html5/syntax.html#void-elements
            if not inside_blocked and node.tag_type in opening_types:
                pieces.append('<%s>' % node.tag)
                inside_blocked = True
            elif node.tag_type == HtmlTagType.CLOSE_TAG:
                pieces.append('</%s>' % node.tag)
                inside_blocked = False
            continue

        if node.tag == 'base':
            node.attributes = {}
            pieces.append(serialize_tag(node))
            continue

        # Iterate over a snapshot: the loop may add "_portia_*" attributes.
        for name, value in node.attributes.copy().items():
            if name.startswith('on') or name == "http-equiv":
                # Empty intrinsic events
                node.attributes[name] = ""
            elif base and proxy and name == "style" and value is not None:
                node.attributes[name] = process_css(value, -1, base)
            elif node.tag in ('frame', 'iframe') and name == 'src':
                node.attributes[name] = '/static/frames-not-supported.html'
            elif name in URI_ATTRIBUTES and value is not None:
                # Rewrite javascript URIs
                is_anchor_href = node.tag == "a" and name == 'href'
                if _contains_js(unescape(value)):
                    node.attributes[name] = "#"
                elif base and proxy and not is_anchor_href:
                    node.attributes[name] = wrap_url(value, -1, base)
                    node.attributes['_portia_%s' % name] = value
                elif base:
                    node.attributes[name] = urljoin(base, value)
        pieces.append(serialize_tag(node))

    return ''.join(pieces)
def descriptify(doc, base=None, proxy=None):
    """Return ``doc`` with script-capable markup neutralised.

    Blocked tags (``BLOCKED_TAGNAMES``) are emitted as bare open/close tags
    with their non-blank content replaced by a placeholder comment,
    ``<base>`` tags are stripped of attributes, and attributes of all other
    tags are cleaned (event handlers emptied, frame sources redirected, URIs
    rewritten according to ``base`` / ``proxy``).
    """

    def clean_attribute(tag, key, val):
        # Applies at most one rewrite to a single attribute of ``tag``;
        # the order of the checks matters and mirrors a single elif chain.
        if key.startswith('on') or key == "http-equiv":
            # Empty intrinsic events
            tag.attributes[key] = ""
            return
        if base and proxy and key == "style" and val is not None:
            tag.attributes[key] = process_css(val, -1, base)
            return
        if tag.tag in ('frame', 'iframe') and key == 'src':
            tag.attributes[key] = '/static/frames-not-supported.html'
            return
        if key in URI_ATTRIBUTES and val is not None:
            # Rewrite javascript URIs
            if _contains_js(unescape(val)):
                tag.attributes[key] = "#"
            elif base and proxy and not (tag.tag == "a" and key == 'href'):
                tag.attributes[key] = wrap_url(val, -1, base)
                tag.attributes['_portia_%s' % key] = val
            elif base:
                tag.attributes[key] = urljoin(base, val)

    cleaned = []
    blocked_open = False
    for item in parse_html(doc):
        if isinstance(item, HtmlTag):
            if item.tag in BLOCKED_TAGNAMES:
                # Assumes there are no void elements in BLOCKED_TAGNAMES
                # http://www.w3.org/TR/html5/syntax.html#void-elements
                starts_block = item.tag_type in (
                    HtmlTagType.OPEN_TAG, HtmlTagType.UNPAIRED_TAG)
                if not blocked_open and starts_block:
                    cleaned.append('<%s>' % item.tag)
                    blocked_open = True
                elif item.tag_type == HtmlTagType.CLOSE_TAG:
                    cleaned.append('</%s>' % item.tag)
                    blocked_open = False
            elif item.tag == 'base':
                item.attributes = {}
                cleaned.append(serialize_tag(item))
            else:
                # Snapshot the attributes: cleaning may add new keys.
                for key, val in item.attributes.copy().items():
                    clean_attribute(item, key, val)
                cleaned.append(serialize_tag(item))
        else:
            fragment = doc[item.start:item.end]
            if blocked_open and fragment.strip():
                cleaned.append('<!-- Removed by portia -->')
            else:
                cleaned.append(fragment)
    return ''.join(cleaned)
def insert_base_url(html, base):
    """Make sure ``html`` carries a ``<base>`` tag with a usable href.

    If a ``<base>`` tag with an href exists and the href is not absolute,
    the tag is replaced by one whose href is joined against ``base``; an
    absolute href is left alone. Otherwise a new ``<base href=base>`` tag is
    inserted after the opening ``<head>``, or wrapped in a new ``<head>``
    after the opening ``<html>``, or after the doctype, or at the very start.
    """
    base_href = base_node = head_node = html_node = None
    for node in parse_html(html):
        # Non-tag fragments have no ``tag`` attribute and match nothing.
        tag_name = getattr(node, "tag", None)
        if tag_name == "base":
            base_href = node.attributes.get("href", None)
            base_node = node
        elif tag_name == "head" and node.tag_type == HtmlTagType.OPEN_TAG:
            head_node = node
        elif tag_name == "html" and node.tag_type == HtmlTagType.OPEN_TAG:
            html_node = node

    if base_href:
        if _is_abs_url(base_href):
            return html
        # replace original base tag
        replacement = '<base href="%s" />' % urlparse.urljoin(base, base_href)
        return html[:base_node.start] + replacement + html[base_node.end:]

    # Generate new base element and include
    new_tag = '<base href="%s" />' % base
    if head_node:
        position = head_node.end
    elif html_node:
        new_tag = "\n<head>%s</head>\n" % new_tag
        position = html_node.end
    else:
        doctype_match = DOCTYPERE.search(html)
        position = doctype_match.end() if doctype_match else 0
    return html[:position] + new_tag + html[position:]
def _test_sample(self, source, expected_parsed, samplecount=None):
    """Compare ``parse_html(source)`` with ``expected_parsed`` element by element.

    ``expected_parsed`` is consumed from the front (``pop(0)``) as elements
    are matched, so the caller's list is emptied on success.

    ``samplecount``, when given, is appended to failure messages so the
    failing sample can be identified.

    Failures are reported through ``self.fail`` / ``self.assertEqual`` rather
    than bare ``assert`` statements, which are stripped under ``python -O``.
    """

    def with_sample(message):
        # Tag the failure with the sample index when the caller provided one.
        if samplecount is not None:
            message += " (sample %d)" % samplecount
        return message

    count_element = 0
    count_expected = 0
    for element in parse_html(source):
        # Exact type comparisons are kept on purpose: isinstance() would also
        # match subclasses and could change which branch an element takes.
        if type(element) is HtmlTag:
            count_element += 1
        element_text = source[element.start:element.end]

        if not expected_parsed:
            # The parser produced more elements than the expectation lists;
            # fail with context instead of an IndexError from pop(0).
            self.fail(with_sample(
                "Unexpected extra element [%s,%s] %s" % (
                    element.start, element.end, element_text)))
        expected = expected_parsed.pop(0)
        if type(expected) is HtmlTag:
            count_expected += 1
        expected_text = source[expected.start:expected.end]

        if element.start != expected.start or element.end != expected.end:
            self.fail(with_sample(
                "[%s,%s] %s != [%s,%s] %s" % (
                    element.start, element.end, element_text,
                    expected.start, expected.end, expected_text)))

        if type(element) is not type(expected):
            self.fail(with_sample(
                "(%s) %s != (%s) %s for text\n%s" % (
                    count_element, repr(type(element)),
                    count_expected, repr(type(expected)), element_text)))

        if type(element) is HtmlTag:
            self.assertEqual(element.tag, expected.tag)
            self.assertEqual(element.attributes, expected.attributes)
            self.assertEqual(element.tag_type, expected.tag_type)

        if type(element) is HtmlDataFragment:
            # The detailed message needs a sample index for its %d slot, so
            # it is only built when one was supplied.
            if samplecount is not None:
                msg = "Got: %s Expected: %s in sample: %d [%d:%d] (%s)" % (
                    element.is_text_content, expected.is_text_content,
                    samplecount, element.start, element.end,
                    repr(element_text))
            else:
                msg = None
            self.assertEqual(element.is_text_content, expected.is_text_content, msg)

    if expected_parsed:
        # The parser produced fewer elements than expected.
        self.fail(with_sample("Expected %s" % repr(expected_parsed)))
def descriptify(doc):
    """Return ``doc`` with JavaScript disabled.

    ``<script>`` open/close tags and ``<noscript>`` tags are wrapped in the
    ``_AS_COMMENT_BEGIN`` / ``_AS_COMMENT_END`` markers, as is non-blank
    content found while a script is open (unless it already looks like an
    HTML comment). On other tags, intrinsic event attributes are emptied and
    URI attributes containing ``javascript:`` become ``about:blank``.
    """

    def as_comment(fragment):
        return _AS_COMMENT_BEGIN + fragment + _AS_COMMENT_END

    pieces = []
    script_open = False
    for node in parse_html(doc):
        raw = doc[node.start:node.end]

        if not isinstance(node, HtmlTag):
            looks_like_comment = raw.startswith("<!--") and raw.endswith("-->")
            if script_open and raw.strip() and not looks_like_comment:
                pieces.append(as_comment(raw))
            else:
                pieces.append(raw)
            continue

        is_script = node.tag == "script"
        if not script_open and is_script and node.tag_type == HtmlTagType.OPEN_TAG:
            pieces.append(as_comment(raw))
            script_open = True
        elif is_script and node.tag_type == HtmlTagType.CLOSE_TAG:
            script_open = False
            pieces.append(as_comment(raw))
        elif node.tag == "noscript":
            pieces.append(as_comment(raw))
        else:
            for name, value in node.attributes.copy().items():
                if name in INTRINSIC_EVENT_ATTRIBUTES:
                    # Empty intrinsic events
                    node.attributes[name] = ""
                elif (name in URI_ATTRIBUTES and value is not None
                        and "javascript:" in _deentitize_unicode(value)):
                    # Rewrite javascript URIs
                    node.attributes[name] = "about:blank"
            pieces.append(serialize_tag(node))

    return ''.join(pieces)
def descriptify(doc):
    """Return ``doc`` with JavaScript disabled.

    Script open/close tags, ``<noscript>`` tags and non-blank content seen
    while a script is open are wrapped in ``_AS_COMMENT_BEGIN`` /
    ``_AS_COMMENT_END`` (content that already starts with ``<!--`` and ends
    with ``-->`` is passed through). Remaining tags are re-serialized with
    intrinsic event attributes emptied and ``javascript:`` URIs replaced by
    ``about:blank``.
    """
    wrap = lambda chunk: _AS_COMMENT_BEGIN + chunk + _AS_COMMENT_END
    result = []
    commenting = False
    for item in parse_html(doc):
        if isinstance(item, HtmlTag):
            source_text = doc[item.start:item.end]
            script_tag = item.tag == "script"
            if script_tag and not commenting and item.tag_type == HtmlTagType.OPEN_TAG:
                result.append(wrap(source_text))
                commenting = True
            elif script_tag and item.tag_type == HtmlTagType.CLOSE_TAG:
                commenting = False
                result.append(wrap(source_text))
            elif item.tag == "noscript":
                result.append(wrap(source_text))
            else:
                # Work on a snapshot of the attribute items while rewriting.
                for key, val in item.attributes.copy().items():
                    if key in INTRINSIC_EVENT_ATTRIBUTES:
                        # Empty intrinsic events
                        item.attributes[key] = ""
                        continue
                    if key not in URI_ATTRIBUTES or val is None:
                        continue
                    # Rewrite javascript URIs
                    if "javascript:" in _deentitize_unicode(val):
                        item.attributes[key] = "about:blank"
                result.append(serialize_tag(item))
        else:
            text = doc[item.start:item.end]
            is_comment = text.startswith("<!--") and text.endswith("-->")
            needs_wrapping = commenting and text.strip() and not is_comment
            result.append(wrap(text) if needs_wrapping else text)
    return ''.join(result)
def apply(self):
    """Write this object's annotations into ``self.numbered_html``.

    Annotations from ``self.split()`` are grouped by their ``'tagid'`` and
    processed in ascending tag-id order while the parsed document is
    streamed into ``output``. Regular annotations become attributes on the
    matching tag; generated annotations are handled by
    ``self._get_generated``. The joined output is passed through
    ``remove_tagids`` before being returned.

    NOTE(review): indentation was reconstructed from a whitespace-collapsed
    source; confirm nesting against the upstream file.
    """
    selector_annotations, tagid_annotations = self.split()
    inserts, numbered_html = defaultdict(list), self.numbered_html
    if selector_annotations:
        # Selector-based annotations are converted and merged into the
        # tag-id based list.
        converted_annotations = self.apply_selector(selector_annotations)
        tagid_annotations += converted_annotations
    if not self.legacy:
        tagid_annotations = self.verify(
            [arg_to_iter(a) for a in tagid_annotations])
    target = iter(parse_html(numbered_html))
    output, stack = [], []
    elem = next(target)
    last_id = 0
    # XXX: A dummy element is added to the end so if the last annotation is
    # generated it will be added to the output
    filtered = defaultdict(list)
    for grouped in tagid_annotations:
        for ann in arg_to_iter(grouped):
            filtered[ann['tagid']].append(ann)
    # The dummy id (1e9) is chained after the sorted annotations below.
    dummy = [(1e9, [{}])]
    sorted_annotations = sorted([(int(k), v) for k, v in filtered.items()
                                 if k is not None])
    try:
        for aid, annotation_data in chain(sorted_annotations, dummy):
            # Move target until replacement/insertion point
            while True:
                # Non-tag fragments and <ins> tags are copied through as-is.
                while not isinstance(elem, HtmlTag) or elem.tag == 'ins':
                    output.append(numbered_html[elem.start:elem.end])
                    elem = next(target)
                if elem.tag_type in {OPEN_TAG, UNPAIRED_TAG}:
                    last_id = elem.attributes.get(TAGID)
                    stack.append(last_id)
                if elem.tag_type in {CLOSE_TAG, UNPAIRED_TAG} and stack:
                    # '__added' marks a tag whose text is already in output,
                    # so it is never emitted twice.
                    if ('__added' not in elem.attributes and
                            last_id is not None and aid is not None and
                            int(last_id) < int(aid)):
                        output.append(numbered_html[elem.start:elem.end])
                        elem.attributes['__added'] = True
                    last_inserted = stack.pop()
                    # Pending inserts keyed by the tag id just popped.
                    to_insert = inserts.pop(last_inserted, None)
                    if to_insert:
                        output.extend(to_insert)
                        # Skip all nodes up to the next HtmlTag as these
                        # have already been added
                        while True:
                            elem = next(target)
                            try:
                                last_id = elem.attributes.get(TAGID,
                                                              last_id)
                            except AttributeError:
                                # Element without an ``attributes`` attribute;
                                # keep the previous last_id.
                                pass
                            if isinstance(elem, HtmlTag):
                                break
                        continue
                if (last_id is not None and aid is not None and
                        int(last_id) < int(aid)):
                    if '__added' not in elem.attributes:
                        output.append(numbered_html[elem.start:elem.end])
                        elem.attributes['__added'] = True
                    elem = next(target)
                else:
                    # Not before the annotated id any more: stop advancing.
                    break

            generated = []
            next_generated = []
            regular_annotations = []
            # Place generated annotations at the end and sort by slice
            for annotation in sorted(annotation_data, key=_annotation_key):
                if annotation.get('generated'):
                    if annotation.get('insert_after'):
                        next_generated.append(annotation)
                    else:
                        generated.append(annotation)
                else:
                    regular_annotations.append(annotation)
            # Add annotations data as required
            if regular_annotations:
                annotation_info = self.generate(regular_annotations)
                for key, val in annotation_info.items():
                    elem.attributes[key] = val
            next_text_section = ''
            if generated:
                # tee() lets the helper look ahead without consuming target.
                inner_data, target = tee(target)
                nodes = _get_inner_nodes(inner_data)
                next_text_section = self._get_generated(
                    elem, generated, nodes, inserts)
            if next_generated:
                inner_data, target = tee(target)
                open_tags = 0 if elem.tag_type == UNPAIRED_TAG else 1
                nodes = _get_inner_nodes(inner_data, open_tags=open_tags,
                                         insert_after=True)
                next_text_section = self._get_generated(
                    elem, next_generated, nodes, inserts)

            if '__added' not in elem.attributes:
                # Serialize so the attributes set above reach the output.
                output.append(serialize_tag(elem))
                elem.attributes['__added'] = True
            # If an <ins> tag has been inserted we need to move forward
            if next_text_section:
                while True:
                    elem = next(target)
                    if (isinstance(elem, HtmlDataFragment) and
                            elem.is_text_content):
                        break
                    output.append(numbered_html[elem.start:elem.end])
                output.append(next_text_section)
    # Reached the end of the document
    except StopIteration:
        output.append(numbered_html[elem.start:elem.end])
    else:
        # Annotations exhausted first: copy the rest of the document.
        for element in target:
            output.append(numbered_html[element.start:element.end])
    return remove_tagids(''.join(output))
def apply_annotations(source_html, target_html):
    """Apply the annotations found in ``source_html`` to raw ``target_html``.

    ``source_html`` must be the annotated (tagged) source; ``target_html`` is
    the original raw source without tags or annotations. The target is
    numbered with ``add_tagids``, streamed through ``parse_html`` and copied
    to the output while annotations are merged in at the element whose tag id
    matches the annotation's data id. Tag ids are removed from the result.

    The parsed stream is advanced with the builtin ``next()`` instead of the
    Python 2-only ``.next()`` method so the function runs on Python 2.6+ and
    Python 3.

    NOTE(review): indentation was reconstructed from a whitespace-collapsed
    source; confirm nesting against the upstream file.
    """
    annotations = _extract_annotations(source_html)
    target_page = HtmlPage(body=target_html)
    cleansing = _get_cleansing(target_page, annotations)

    numbered_html = add_tagids(target_page)
    target = iter(parse_html(numbered_html))
    output = []

    element = next(target)
    eof = False
    # Copy everything before the first element that carries a tag id.
    while not (isinstance(element, HtmlTag) and TAGID in element.attributes):
        output.append(numbered_html[element.start:element.end])
        element = next(target)
    last_id = element.attributes[TAGID]
    for i, annotation in enumerate(annotations):
        # look up replacement/insertion point
        aid = _get_data_id(annotation)
        # move target until replacement/insertion point
        while int(last_id) < int(aid):
            output.append(numbered_html[element.start:element.end])
            element = next(target)
            while not (isinstance(element, HtmlTag) and TAGID in element.attributes):
                output.append(numbered_html[element.start:element.end])
                element = next(target)
            last_id = element.attributes[TAGID]

        # replace/insert in target
        if isinstance(annotation, HtmlTag):
            for key, val in annotation.attributes.items():
                if key.startswith("data-scrapy-"):
                    element.attributes[key] = val
            output.append(serialize_tag(element))
            # Only advance when the next annotation targets another element.
            if not (i + 1 < len(annotations) and _get_data_id(annotations[i + 1]) == aid):
                element = next(target)
        else:  # partial annotation
            closing_tags = _get_closing_tags(annotation)
            if not (i > 0 and _get_data_id(annotations[i - 1]) == aid):
                output.append(numbered_html[element.start:element.end])
                while closing_tags > 0:
                    element = next(target)
                    output.append(numbered_html[element.start:element.end])
                    if isinstance(element, HtmlTag) and \
                            element.tag_type == HtmlTagType.CLOSE_TAG:
                        closing_tags -= 1
            elif (i > 0 and isinstance(annotations[i - 1], HtmlTag) and
                    annotation[0].start > annotations[i - 1].end):
                element = next(target)
                while closing_tags > 0:
                    output.append(numbered_html[element.start:element.end])
                    element = next(target)
                    if isinstance(element, HtmlTag) and \
                            element.tag_type == HtmlTagType.CLOSE_TAG:
                        closing_tags -= 1
                output.append(numbered_html[element.start:element.end])

            num_tags_inside = 0
            partial_output = ""

            # computes number of tags inside a partial annotation
            for p in annotation:
                partial_output += source_html[p.start:p.end]
                if isinstance(p, HtmlTag):
                    num_tags_inside += 1
                    if "insert-after" in p.attributes:
                        num_tags_inside -= 2

            if aid in cleansing:
                partial_output, fix_tag_count = _merge_code(
                    partial_output, cleansing[aid])
                num_tags_inside += fix_tag_count

            output.append(partial_output)

            element = next(target)  # consume reference tag

            # consume the tags inside partial annotation
            while num_tags_inside > 0:
                if isinstance(element, HtmlTag):
                    num_tags_inside -= 1
                element = next(target)

            if not isinstance(element, HtmlTag):
                element = next(target)

        if not (i + 1 < len(annotations) and _get_data_id(annotations[i + 1]) == aid):
            # Copy through to the next element with a tag id, or note that
            # the document ended while searching for it.
            try:
                while not (isinstance(element, HtmlTag) and TAGID in element.attributes):
                    output.append(numbered_html[element.start:element.end])
                    element = next(target)
            except StopIteration:
                eof = True
            else:
                last_id = element.attributes[TAGID]

    if not eof:
        output.append(numbered_html[element.start:element.end])
    for element in target:
        output.append(numbered_html[element.start:element.end])

    return remove_tagids(''.join(output))
def test_ignore_xml_declaration(self):
    """An xml declaration inside html must not be flagged as text content."""
    markup = u"<p>The text</p><?xml:namespace blabla/><p>is here</p>"
    elements = list(parse_html(markup))
    # The fourth parsed element is the one under test.
    fourth = elements[3]
    self.assertFalse(fourth.is_text_content)
def test_ignore_xml_declaration(self):
    """An xml declaration inside html must not be flagged as text content."""
    source = u"<p>The text</p><?xml:namespace blabla/><p>is here</p>"
    # Materialize the full parse, then inspect the element at index 3.
    parsed_elements = [node for node in parse_html(source)]
    self.assertFalse(parsed_elements[3].is_text_content)
def apply_annotations(annotations, target_page):
    """Write ``annotations`` into ``target_page`` and return the new HTML.

    Only annotations that have a ``'tagid'`` and either ``'annotations'`` or
    ``'ignore'`` are used. They are grouped by tag id and applied in
    ascending id order while the numbered document is streamed into the
    output. Regular annotations become tag attributes; generated annotations
    are handled by ``_get_generated_annotation``. Tag ids are removed from
    the returned markup.

    The parsed stream is advanced with the builtin ``next()`` instead of the
    Python 2-only ``.next()`` method, and the first tag-id comparison is
    guarded against a missing id in the same way as the later one.

    NOTE(review): indentation was reconstructed from a whitespace-collapsed
    source; confirm nesting against the upstream file.
    """
    inserts = defaultdict(list)
    numbered_html = add_tagids(target_page)
    target = iter(parse_html(numbered_html))
    output, tag_stack = [], []

    element = next(target)
    last_id = 0
    # XXX: A dummy element is added to the end so if the last annotation is
    # generated it will be added to the output
    filtered = defaultdict(list)
    for ann in annotations:
        if ann and ann.get('tagid') and (ann.get('annotations') or
                                         ann.get('ignore')):
            filtered[ann['tagid']].append(ann)
    dummy = [(1e9, [{}])]
    sorted_annotations = sorted([(int(k), v) for k, v in filtered.items()] +
                                dummy)
    try:
        for aid, annotation_data in sorted_annotations:
            # Move target until replacement/insertion point
            while True:
                while not isinstance(element, HtmlTag):
                    output.append(numbered_html[element.start:element.end])
                    element = next(target)
                if element.tag_type in {OPEN_TAG, UNPAIRED_TAG}:
                    last_id = element.attributes.get(TAGID)
                    tag_stack.append(last_id)
                if element.tag_type in {CLOSE_TAG, UNPAIRED_TAG} and tag_stack:
                    # last_id is None when the last opened tag has no tag id;
                    # int(None) would raise TypeError, so guard it here too.
                    if ('__added' not in element.attributes and
                            last_id is not None and
                            int(last_id) < int(aid)):
                        output.append(numbered_html[element.start:element.end])
                        element.attributes['__added'] = True
                    last_inserted = tag_stack.pop()
                    to_insert = inserts.pop(last_inserted, None)
                    if to_insert:
                        output.extend(to_insert)
                        # Skip all nodes up to the next HtmlTag as these
                        # have already been added
                        while True:
                            element = next(target)
                            try:
                                last_id = element.attributes.get(TAGID,
                                                                 last_id)
                            except AttributeError:
                                # Element without an ``attributes`` attribute;
                                # keep the previous last_id.
                                pass
                            if isinstance(element, HtmlTag):
                                break
                        continue
                if last_id is not None and int(last_id) < int(aid):
                    if '__added' not in element.attributes:
                        output.append(numbered_html[element.start:element.end])
                        element.attributes['__added'] = True
                    element = next(target)
                else:
                    break

            generated = []
            next_generated = []
            # Place generated annotations at the end and sort by slice
            for annotation in sorted(annotation_data, key=_annotation_key):
                if annotation.get('generated'):
                    if annotation.get('insert_after'):
                        next_generated.append(annotation)
                    else:
                        generated.append(annotation)
                else:
                    # Add annotations data as required
                    annotation_info = _gen_annotation_info(annotation)
                    for key, val in annotation_info.items():
                        element.attributes[key] = val
            next_text_section = ''
            if generated:
                # tee() lets the helper look ahead without consuming target.
                inner_data, target = tee(target)
                nodes = _get_inner_nodes(inner_data)
                next_text_section = _get_generated_annotation(
                    element, generated, nodes, numbered_html, inserts)
            if next_generated:
                inner_data, target = tee(target)
                open_tags = 0 if element.tag_type == UNPAIRED_TAG else 1
                nodes = _get_inner_nodes(inner_data, open_tags=open_tags,
                                         insert_after=True)
                next_text_section = _get_generated_annotation(
                    element, next_generated, nodes, numbered_html, inserts)

            if '__added' not in element.attributes:
                # Serialize so the attributes set above reach the output.
                output.append(serialize_tag(element))
                element.attributes['__added'] = True
            # If an <ins> tag has been inserted we need to move forward
            if next_text_section:
                while True:
                    # NOTE: uses a separate name (``elem``), as the original
                    # did, so ``element`` is left untouched by this scan.
                    elem = next(target)
                    if (isinstance(elem, HtmlDataFragment) and
                            elem.is_text_content):
                        break
                    output.append(numbered_html[elem.start:elem.end])
                output.append(next_text_section)
    # Reached the end of the document
    except StopIteration:
        output.append(numbered_html[element.start:element.end])
    else:
        for element in target:
            output.append(numbered_html[element.start:element.end])
    return remove_tagids(''.join(output))
def apply_annotations(annotations, target_page):
    """Write ``annotations`` into ``target_page`` and return the new HTML.

    ``_filter_annotations`` splits the input into selector-based and tag-id
    based annotations; selector annotations are converted with
    ``apply_selector_annotations`` and appended to the tag-id list. The
    annotations are then grouped by ``'tagid'`` and applied in ascending id
    order while the numbered document is streamed into the output. The
    joined output is passed through ``remove_tagids`` before being returned.

    NOTE(review): indentation was reconstructed from a whitespace-collapsed
    source; confirm nesting against the upstream file.
    """
    selector_annotations, tagid_annotations = _filter_annotations(annotations)
    inserts = defaultdict(list)
    numbered_html = add_tagids(target_page)
    if selector_annotations:
        converted_annotations = apply_selector_annotations(
            selector_annotations, numbered_html)
        tagid_annotations += converted_annotations
    target = iter(parse_html(numbered_html))
    output, tag_stack = [], []

    element = next(target)
    last_id = 0
    # XXX: A dummy element is added to the end so if the last annotation is
    # generated it will be added to the output
    filtered = defaultdict(list)
    for ann in tagid_annotations:
        filtered[ann['tagid']].append(ann)
    dummy = [(1e9, [{}])]
    sorted_annotations = sorted([(int(k), v) for k, v in filtered.items()] +
                                dummy)
    try:
        for aid, annotation_data in sorted_annotations:
            # Move target until replacement/insertion point
            while True:
                # Non-tag fragments and <ins> tags are copied through as-is.
                while not isinstance(element, HtmlTag) or element.tag == 'ins':
                    output.append(numbered_html[element.start:element.end])
                    element = next(target)
                if element.tag_type in {OPEN_TAG, UNPAIRED_TAG}:
                    last_id = element.attributes.get(TAGID)
                    tag_stack.append(last_id)
                if element.tag_type in {CLOSE_TAG, UNPAIRED_TAG} and tag_stack:
                    # '__added' marks a tag whose text is already in output,
                    # so it is never emitted twice.
                    if ('__added' not in element.attributes and
                            last_id is not None and aid is not None and
                            int(last_id) < int(aid)):
                        output.append(numbered_html[element.start:element.end])
                        element.attributes['__added'] = True
                    last_inserted = tag_stack.pop()
                    # Pending inserts keyed by the tag id just popped.
                    to_insert = inserts.pop(last_inserted, None)
                    if to_insert:
                        output.extend(to_insert)
                        # Skip all nodes up to the next HtmlTag as these
                        # have already been added
                        while True:
                            element = next(target)
                            try:
                                last_id = element.attributes.get(TAGID,
                                                                 last_id)
                            except AttributeError:
                                # Element without an ``attributes`` attribute;
                                # keep the previous last_id.
                                pass
                            if isinstance(element, HtmlTag):
                                break
                        continue
                if (last_id is not None and aid is not None and
                        int(last_id) < int(aid)):
                    if '__added' not in element.attributes:
                        output.append(numbered_html[element.start:element.end])
                        element.attributes['__added'] = True
                    element = next(target)
                else:
                    # Not before the annotated id any more: stop advancing.
                    break

            generated = []
            next_generated = []
            # Place generated annotations at the end and sort by slice
            for annotation in sorted(annotation_data, key=_annotation_key):
                if annotation.get('generated'):
                    if annotation.get('insert_after'):
                        next_generated.append(annotation)
                    else:
                        generated.append(annotation)
                else:
                    # Add annotations data as required
                    annotation_info = _gen_annotation_info(annotation)
                    for key, val in annotation_info.items():
                        element.attributes[key] = val
            next_text_section = ''
            if generated:
                # tee() lets the helper look ahead without consuming target.
                inner_data, target = tee(target)
                nodes = _get_inner_nodes(inner_data)
                next_text_section = _get_generated_annotation(
                    element, generated, nodes, numbered_html, inserts)
            if next_generated:
                inner_data, target = tee(target)
                open_tags = 0 if element.tag_type == UNPAIRED_TAG else 1
                nodes = _get_inner_nodes(inner_data, open_tags=open_tags,
                                         insert_after=True)
                next_text_section = _get_generated_annotation(
                    element, next_generated, nodes, numbered_html, inserts)

            if '__added' not in element.attributes:
                # Serialize so the attributes set above reach the output.
                output.append(serialize_tag(element))
                element.attributes['__added'] = True
            # If an <ins> tag has been inserted we need to move forward
            if next_text_section:
                while True:
                    # NOTE(review): this scan binds ``elem``, not ``element``;
                    # looks deliberate but confirm.
                    elem = next(target)
                    if (isinstance(elem, HtmlDataFragment) and
                            elem.is_text_content):
                        break
                    output.append(numbered_html[elem.start:elem.end])
                output.append(next_text_section)
    # Reached the end of the document
    except StopIteration:
        output.append(numbered_html[element.start:element.end])
    else:
        # Annotations exhausted first: copy the rest of the document.
        for element in target:
            output.append(numbered_html[element.start:element.end])
    return remove_tagids(''.join(output))
def _merge_code(code1, code2):
    """Merge two pieces of html code by aligning their text content.

    Both inputs are parsed; the text of one must be a prefix of the other.
    Tags from both sides are interleaved at the text offsets where they
    occur. When both sides have a tag at the same offset, both orderings are
    tried and the first candidate accepted by ``_order_is_valid`` is used.

    Returns a ``(merged_html, tag_delta)`` tuple where ``tag_delta`` is the
    number of tags in the merged result minus the number of tags in
    ``code1``.
    """

    def split_markup(code, parsed):
        # Separates tags from text, recording for every tag the text offset
        # (in characters of accumulated text) at which it appears.
        offsets, tags, chunks = [], [], []
        position = 0
        for node in parsed:
            if isinstance(node, HtmlTag):
                offsets.append(position)
                tags.append(node)
            else:
                position += node.end - node.start
                chunks.append(code[node.start:node.end])
        return offsets, tags, "".join(chunks)

    parsed1 = list(parse_html(code1))
    parsed2 = list(parse_html(code2))
    insert_points1, tags1, text1 = split_markup(code1, parsed1)
    insert_points2, tags2, text2 = split_markup(code2, parsed2)

    assert(text1.startswith(text2) or text2.startswith(text1))

    # unique sorted list of insert points
    insert_points = sorted(set(insert_points1 + insert_points2))

    possible_outs = [""]
    start = 0
    # insert tags in correct order, calculate all alternatives when
    # order is ambiguous
    for end in insert_points:
        text_chunk = text1[start:end]
        possible_outs = [candidate + text_chunk for candidate in possible_outs]
        swapped_outs = list(possible_outs)
        in_first = end in insert_points1
        if in_first:
            tag1 = tags1.pop(0)
            markup1 = code1[tag1.start:tag1.end]
            possible_outs = [candidate + markup1 for candidate in possible_outs]
        if end in insert_points2:
            tag2 = tags2.pop(0)
            markup2 = code2[tag2.start:tag2.end]
            possible_outs = [candidate + markup2 for candidate in possible_outs]
            if in_first:
                # Both sides have a tag here: also try the reverse order.
                possible_outs += [candidate + markup2 + markup1
                                  for candidate in swapped_outs]
        start = end

    # choose the first valid; if none is valid the last candidate is kept
    for out in possible_outs:
        parsed_out = list(parse_html(out))
        if _order_is_valid(parsed_out):
            break

    # Append whatever text the longer side has beyond the shared prefix.
    out += text1[len(text2):] if text1.startswith(text2) else text2[len(text1):]

    tag_count1 = len([node for node in parsed1 if isinstance(node, HtmlTag)])
    tag_count_final = len([node for node in parsed_out
                           if isinstance(node, HtmlTag)])
    return out, tag_count_final - tag_count1