def generate_content_test(filename):
    """Run metahtml content extraction on a saved page and write results next to it.

    Writes `<filename>.metahtml.html` / `.metahtml.text` with the extracted
    content, then an html diff (`<filename>.diff.html`) against the saved
    newspaper3k extraction for visual comparison.
    """
    filename_output_html = filename + '.metahtml.html'
    filename_output_text = filename + '.metahtml.text'
    #if os.path.isfile(filename_output_text):
    #    return
    # The containing directory name encodes the source URL
    # (e.g. "https___example_com_page" -> "https://example/com/page").
    urlish = os.path.dirname(filename).split('/')[-1]
    urlish = urlish.replace('___', '://').replace('_', '/')
    with open(filename) as f:
        html = f.read()
    meta = metahtml.parse_all(html, urlish, fast=False)
    with open(filename_output_html, 'w') as fout:
        html_metahtml = meta['content']['value']['html']
        fout.write(html_metahtml)
    with open(filename_output_text, 'w') as fout:
        fout.write(meta['content']['value']['text'])
    # generate the diff
    with open(filename + '.newspaper3k.html') as f:
        html_newspaper = f.read()
    with open(filename + '.diff.html', 'w') as f:
        html_diff = '<style> ins { background-color: #00ff00; }\ndel {background-color: #ff0000} </style>'
        html_diff += htmldiff(html_newspaper, html_metahtml)
        f.write(html_diff)
def test_basic(self):
    """Multi-page article should merge all pages and match the expected regression html."""
    html = load_regression_data('basic-multi-page.html')
    urldict = self._make_basic_urldict()
    fetcher = urlfetch.MockUrlFetch(urldict)
    options = {
        'url': 'http://basic.com/article.html',
        'multipage': True,
        'urlfetch': fetcher
    }
    doc = Document(html, **options)
    res = doc.summary_with_metadata()
    self.assertIn('Page 2', res.html, 'Should find the page 2 heading')
    self.assertIn('Page 3', res.html, 'Should find the page 3 heading')
    expected_html = load_regression_data('basic-multi-page-expected.html')
    # htmldiff marks differences with <ins>/<del>; a clean match has neither.
    diff_html = htmldiff(expected_html, res.html)
    diff_doc = document_fromstring(diff_html)
    insertions = diff_doc.xpath('//ins')
    deletions = diff_doc.xpath('//del')
    if len(insertions) != 0:
        for i in insertions:
            print('unexpected insertion: %s' % i.xpath('string()'))
        self.fail('readability result does not match expected')
    if len(deletions) != 0:
        for i in deletions:
            print('unexpected deletion: %s' % i.xpath('string()'))
        self.fail('readability result does not match expected')
def main(args=None):
    """CLI entry point (Python 2): diff the <body> of two html files.

    With --annotation, delegates to annotate(); otherwise expects exactly
    two file arguments and writes the merged diff to --output ('-' = stdout).
    """
    if args is None:
        args = sys.argv[1:]
    options, args = parser.parse_args(args)
    if options.annotation:
        return annotate(options, args)
    if len(args) != 2:
        print 'Error: you must give two files'
        parser.print_help()
        sys.exit(1)
    file1, file2 = args
    input1 = read_file(file1)
    input2 = read_file(file2)
    # Diff only the bodies; re-wrap the result with file2's header/footer.
    body1 = split_body(input1)[1]
    pre, body2, post = split_body(input2)
    result = htmldiff(body1, body2)
    result = pre + result + post
    if options.output == '-':
        if not result.endswith('\n'):
            result += '\n'
        sys.stdout.write(result)
    else:
        f = open(options.output, 'wb')
        f.write(result)
        f.close()
def detect_change(file_name: str, new_content: str) -> Optional[str]:
    """Detect if the content in a file differs from a given string.

    The file is (re)written with *new_content* whenever a difference is
    detected or the file does not exist yet.

    Args:
        file_name: The file that contains the string for comparison.
        new_content: The string to compare with the file's contents.

    Returns:
        The html diff between old and new content if the file changed,
        *new_content* itself if the file did not exist, else None.
    """
    if not os.path.isfile(file_name):
        # First sighting of this file: everything counts as new content.
        with open(file_name, "w") as handle:
            handle.write(new_content)
        return new_content
    with open(file_name, "r") as handle:
        old_content = handle.read()
    if old_content == new_content:
        return None
    result = htmldiff(old_content, new_content)
    with open(file_name, "w") as handle:
        handle.write(new_content)
    return result
def post(self):
    """Render an html diff of two script versions (Python 2 App Engine handler).

    Expects v_o_id/v_t_id (script ids) and v_o/v_t (version numbers) as
    request parameters; returns silently when permission is denied.
    """
    v_o_id = self.request.get('v_o_id')
    v_t_id = self.request.get('v_t_id')
    title = permission(v_o_id)
    p = permission(v_t_id)
    if title == False or p == False:
        return
    version_one = self.request.get('v_o')
    version_two = self.request.get('v_t')
    r_one = models.ScriptData.get_version(v_o_id, version_one)
    r_two = models.ScriptData.get_version(v_t_id, version_two)
    # css class name per serialized line-format code
    v = ['s','a','c','d','p','t']
    def to_html(raw_data):
        # Convert json-serialized [(text, format), ...] pairs into <p> markup.
        j = simplejson.loads(raw_data)
        s = StringIO.StringIO()
        for text, line_format in j:
            text = cgi.escape(text, quote=True)
            s.write("<p class='"+v[line_format]+"'>"+text+"</p>")
        return s.getvalue()
    s_one = to_html(r_one.data)
    s_two = to_html(r_two.data)
    content = htmldiff(s_one, s_two)
    self.response.headers['Content-Type'] = 'text/html'
    self.response.out.write(content)
def get_context_data(self, **kwargs):
    """Add per-placeholder html diffs between the public and draft page renders."""
    context = super(DiffView, self).get_context_data(**kwargs)
    page = get_object_or_404(Page, pk=self.pk)
    public_page = self.render_page_placeholders(page.get_public_object(), self.request)
    draft_page = self.render_page_placeholders(page.get_draft_object(), self.request)
    diffs = []
    for slot, public_rendered in public_page.items():
        draft_rendered = draft_page.pop(slot, [])
        diff = htmldiff(public_rendered, draft_rendered)
        tree = parse_html(diff, cleanup=False)
        # Drop childless <ins>/<del> markers that contain only whitespace.
        for item in tree.xpath("//ins | //del"):
            if len(item):
                continue
            content = item.text
            if not (content and content.strip()):
                item.getparent().remove(item)
        diffs.append(etree.tostring(tree, method='html'))
    context.update({
        'title': _('Show current changes'),
        'diffs': diffs,
    })
    return context
def show_entry_history(request, entry_id):
    """
    Display a page with two version of the entries, compared with each other,
    with highlighted changes.
    """
    if not is_contributor(request):
        messages.warning(request, _('This page is for contributors only.'))
        return redirect('index')
    entry = get_object_or_404(Entry, pk=entry_id)
    version_1 = request.GET.get('version_1', False)
    version_2 = request.GET.get('version_2', False)
    # "Newer" side: explicit ?version_2= or the latest version.
    if version_2:
        newer = get_object_or_404(EntryVersion, pk=version_2, entry_id=entry_id)
    else:
        newer = entry.versions.last()
    # "Older" side: explicit ?version_1=, else the last approved version
    # that isn't `newer`, falling back to any version that isn't `newer`.
    if version_1:
        older = get_object_or_404(EntryVersion, pk=version_1, entry_id=entry_id)
    else:
        if entry.versions.count() > 1:
            older = entry.versions.filter(is_approved=True).exclude(pk=newer.pk).last()
            if older is None:
                older = entry.versions.exclude(pk=newer.pk).last()
        else:
            # NOTE(review): with a single version this makes older == newer,
            # producing an empty diff — presumably intentional; confirm.
            older = entry.versions.last()
    if older is None or newer is None:
        raise Http404
    # Keep the chronological order regardless of which side was requested.
    if newer.date < older.date:
        newer, older = older, newer
    def make_html(version):
        # Flatten a version's lines (speaker + text) plus note into one article.
        html = ['<article class="entry-article w3-display-container w3-border w3-card">']
        for line in version.lines.all():
            html.append("<h3>{}</h3>".format(line.speaker))
            html.append(line.text)
        if version.note:
            html.append('<small class="footnote">Footnote: {}</small>'.format(version.note))
        html.append("</article>")
        return "".join(html)
    newer_html = make_html(newer)
    older_html = make_html(older)
    html_diff = htmldiff(older_html, newer_html)
    return render(request, "palanaeum/staff/entry_history.html", {
        'newer_version': newer,
        'newer_html': newer_html,
        'html_diff': html_diff,
        'older_version': older,
        'older_html': older_html,
        'entry': entry,
        'all_versions': EntryVersion.objects.filter(entry=entry),
        'snippets': Snippet.all_visible.filter(entry=entry),
        'images': ImageSource.all_visible.filter(entry=entry)
    })
def question_revisions(request, question_id):
    """Revision history for a Question."""
    question = get_object_or_404(Question, id=question_id)
    revisions = list(question.revisions.all())
    # Bulk-populate author fields to avoid a user query per revision.
    populate_foreign_key_caches(User, ((revisions, ('author', )), ),
        fields=('username', 'gravatar', 'reputation', 'gold', 'silver', 'bronze'))
    for i, revision in enumerate(revisions):
        revision.html = QUESTION_REVISION_TEMPLATE % {
            'title': revision.title,
            'html': sanitize_html(markdowner.convert(revision.text)),
            'tags': ' '.join([
                '<a class="tag">%s</a>' % tag for tag in revision.tagnames.split(' ')
            ]),
        }
        # Attach to each revision the diff against the revision that follows it.
        if i > 0:
            revisions[i - 1].diff = htmldiff(revision.html, revisions[i - 1].html)
    return render_to_response('question_revisions.html', {
        'title': u'Question Revisions',
        'question': question,
        'revisions': revisions,
    }, context_instance=RequestContext(request))
def diff_html(original_content, input_file):
    """
    Generate a html diff between two html files by adding `<ins>` and
    `<del>` tags. (Python 2 code: uses `unicode` and encodes to utf-8.)

    "Link: ..." additions and whitespace-only <ins>/<del> tags are
    stripped from the diff before the prettified body is returned.
    """
    with codecs.open(input_file, 'r', 'utf-8') as right:
        right_content = right.read()
    content = htmldiff(
        original_content,
        right_content).encode('utf-8')
    soup = BeautifulSoup(content, 'lxml')
    # Remove link: additions
    for a in soup.findAll(['a']):
        if a.text and re.search(r'\bLink:\s.+$', a.text.encode('utf-8'),
                                re.MULTILINE | re.UNICODE):
            # BUG FIX: the original passed the flags positionally, where
            # re.sub expects `count` — so MULTILINE/UNICODE never applied.
            a.string = re.sub(
                r'\bLink:\s.+$', u'', a.text,
                flags=re.MULTILINE | re.UNICODE)
    # Remove empty tags
    for ins in soup.findAll(['ins', 'del']):
        if re.match(r'^\s*$', ins.text):
            ins.extract()
    result = []
    for element in soup.body.contents:
        if hasattr(element, 'prettify'):
            result.append(element.prettify())
        elif element and unicode(element) and not re.match(r'^\s*$', unicode(element)):
            result.append(unicode(element))
    return ''.join(result).encode('utf-8')
def render_details(self) -> str:
    """Render a cleaned html diff of the old vs. new template value, marked safe."""
    old_value = self.template_vars["old_value"]
    new_value = self.template_vars["new_value"]
    return Markup(clean_html(htmldiff(old_value, new_value)))
def text_diff(self):
    """Return an html diff of the two tree nodes' text contents, or None.

    Uses the first tree2 diff flagged as a texts diff; returns None when
    no such diff exists.
    """
    # first() returns None on an empty queryset, so the original
    # exists() + first() pair (two queries) collapses to one query.
    diff = self.tree2_diffs.filter(is_texts_diff=True).first()
    if diff is None:
        return None
    return htmldiff(
        diff.treenode1.element.text.texts_html(),
        diff.treenode2.element.text.texts_html()
    )
def attrs_diff(self):
    """Return an html diff of the two tree nodes' attributes, or None.

    Uses the first tree2 diff flagged as an attrs diff; returns None when
    no such diff exists.
    """
    # first() returns None on an empty queryset, so the original
    # exists() + first() pair (two queries) collapses to one query.
    diff = self.tree2_diffs.filter(is_attrs_diff=True).first()
    if diff is None:
        return None
    return htmldiff(
        diff.treenode1.element.attributes_html(),
        diff.treenode2.element.attributes_html()
    )
def get_diff_of_pages(self, page1, page2):
    """Return a colorized html diff of two utf-8 encoded page bodies (Python 2).

    NOTE(review): the text is unescaped twice — once before the style
    replacements and again on the result — presumably to undo double
    html-escaping in the diff output; confirm this is intentional.
    Returns None (after logging) when diffing fails.
    """
    try:
        diff = HTMLParser.HTMLParser().unescape(
            htmldiff(page1.decode("utf-8"), page2.decode("utf-8")).encode("utf-8")
        )
        diff = diff.replace("<del>", '<del style="color:red">')
        diff = diff.replace("<ins>", '<ins style="color:green">')
        return HTMLParser.HTMLParser().unescape(diff)
    except Exception as e:
        self.log("get_diff_of_pages: HTML Error " + str(e))
def compare_html(actual, expected):
    """Assert that the html diff of *actual* vs *expected* equals *actual*.

    Prints and reports the index of the first differing character in the
    assertion message. Returns True when the diff matches *actual*.
    """
    _actual = removed_spaces(actual)
    _expected = removed_spaces(expected)
    diff = html_unquote(htmldiff(_actual, _expected))
    # BUG FIX: initialize i — with empty inputs the loop never runs, and
    # evaluating the assertion message (_actual[:i]) raised NameError.
    i = 0
    for i, (a, e) in enumerate(zip(_actual, _expected)):
        if a != e:
            print(i)
            break
    assert diff == _actual, _actual[:i]
    return diff == _actual
def get_diff_of_pages(self, page1, page2):
    """Return a colorized html diff of two utf-8 encoded page bodies (Python 2).

    NOTE(review): the text is unescaped twice (before the replacements
    and again on the result) — presumably to undo double html-escaping;
    confirm. Returns None (after logging) when diffing fails.
    """
    try:
        diff = HTMLParser.HTMLParser().unescape(
            htmldiff(page1.decode('utf-8'), page2.decode('utf-8')).encode('utf-8'))
        diff = diff.replace('<del>', '<del style="color:red">')
        diff = diff.replace('<ins>', '<ins style="color:green">')
        return HTMLParser.HTMLParser().unescape(diff)
    except Exception as e:
        self.log('get_diff_of_pages: HTML Error ' + str(e))
def unified_diff(content1, content2):
    """Return an html diff (string) of two chapter bodies after cleaning them.

    NOTE: on a cleaning failure this returns the dict {"result": False}
    instead of a string — callers must handle both return shapes.
    """
    try:
        content1 = clean_chapter_html(content1, clean_comments_trail=True)
        content2 = clean_chapter_html(content2, clean_comments_trail=True)
    except Exception as e:
        logger.error('ERROR while cleaning content %s. Rev 1: %s Chapter: %s' % (
            e, content1, content2))
        return {"result": False}
    diff = htmldiff(content1, content2)
    return diff
def _diff_elements(old, new):
    """Diff the contents of two Beautiful Soup elements.

    Note that this returns the "new" element with its content replaced
    by the diff; empty string when either element is missing/falsy.
    """
    if not old:
        return ''
    if not new:
        return ''
    container = copy.copy(new)
    container.clear()
    container.append(htmldiff(str(old), str(new)))
    return container
def visual_diff(revision):
    """Render an html diff between a git revision and its first parent.

    NOTE(review): `parent_tree` duplicates `parent.tree`; and when the
    commit produces exactly two patches the target html is diffed against
    itself (an effectively empty diff) — presumably a rename sentinel;
    confirm intent before changing.
    """
    target = repo[revision]
    parent = target.parents[0]
    tree = target.tree
    parent_tree = target.parents[0].tree
    diff = parent_tree.diff(tree)
    patches = list(diff)
    filename = get_current_name(tree, patches)
    name = filename[:-4]  # strips a 4-character suffix — presumably a file extension; confirm
    target_html = get_html_revision(name, revision, False)
    if len(patches) == 2:
        return {"patch": htmldiff(target_html, target_html)}
    parent_html = (get_html_revision(name, parent.hex, False)
                   if filename in parent_tree else "")
    return {"patch": htmldiff(parent_html, target_html)}
def compare_pages(url1, url2, selector='body div'):
    """Fetch two pages, diff the first element matching *selector*, and
    return the first page's document with that element replaced by the diff.
    """
    base_doc = parse(url1).getroot()
    base_doc.make_links_absolute()
    other_doc = parse(url2).getroot()
    other_doc.make_links_absolute()
    left = base_doc.cssselect(selector)[0]
    right = other_doc.cssselect(selector)[0]
    # Build the diff fragment and splice it in where `left` was.
    merged = fromstring(htmldiff(tostring(left), tostring(right)))
    parent = left.getparent()
    parent.insert(parent.index(left), merged)
    parent.remove(left)
    return base_doc
def checkExistState(dom1, dom2):
    """Heuristically decide whether two DOMs represent the same page state.

    Python 2 code. Identical hashes are an immediate match; a >10% tag-count
    difference is an immediate mismatch; otherwise the larger direction of
    the html diff is inspected, and states match when the inserted content
    amounts to 5% or less of dom1's tag count.
    """
    if hash(dom1) == hash(dom2):
        return True
    else:
        tagCount1, strippedDom1 = traverseDom(dom1)
        tagCount2, strippedDom2 = traverseDom(dom2)
        mintagCount = min(tagCount1, tagCount2)
        maxtagCount = max(tagCount1, tagCount2)
        if float(mintagCount)/float(maxtagCount) < 0.9:
            logger.info("Different States Huge Difference in Tag Count")
            return False
        # Diff in both directions and keep whichever output is larger.
        diff1 = htmldiff(strippedDom1, strippedDom2)
        diff2 = htmldiff(strippedDom2, strippedDom1)
        if len(diff1) > len(diff2):
            diff = diff1
        else:
            diff = diff2
        bdiff = BeautifulSoup(diff)
        ins = ''.join(str(bdiff.findAll("ins")))
        delete = ''.join(str(bdiff.findAll("del")))
        print cleanDom(delete)
        diffDom = cleanDom(ins)
        print diffDom
        if diffDom != "[]":
            diffTagCount, diffStrippedDom = traverseDom(diffDom)
        else:
            # Nothing inserted: fall back to comparing the stripped DOMs.
            if hash(strippedDom1) == hash(strippedDom2):
                return True
            else:
                return False
        logger.info("tag count %d %d" % (diffTagCount, tagCount1))
        if (float(diffTagCount)/float(tagCount1))*100 > 5:
            return False
        logger.info("STATE ALREADY EXIST")
        #print dom1
        #print dom2
        return True
def highlight_edits(new_html: str, old_html: str) -> str:
    """Return *new_html* with insertions relative to *old_html* underlined.

    Deletions are removed entirely; <ins> is rewritten to <u> because Riot
    does not allow <ins>. A leading "<u>Edit:</u> " marker on the old html
    is excluded from the diff.
    """
    # Don't include `Edit:` text in diff.
    if old_html.startswith("<u>Edit:</u> "):
        old_html = old_html[len("<u>Edit:</u> "):]
    # Generate diff with lxml
    new_html = htmldiff(old_html, new_html)
    # Replace <ins> with <u> since Riot doesn't allow <ins>
    new_html = new_html.replace("<ins>", "<u>").replace("</ins>", "</u>")
    # Remove <del>s since we just want to hide deletions.
    # BUG FIX: re.DOTALL so deletions spanning newlines are also removed
    # (htmldiff output may contain newlines inside <del>...</del>).
    new_html = re.sub(r"<del>.+?</del>", "", new_html, flags=re.DOTALL)
    return new_html
def text_section_update_definitions_if_new(self, message: Dict):
    """Trigger definition updates when a text section's body appears changed.

    NOTE(review): htmldiff() is handed parsed lxml elements rather than
    strings, and its return value is truthy even for identical inputs
    (it returns the merged html, not only the changes) — so this check
    may fire on every call; verify against lxml.html.diff semantics.
    """
    text_section = TextSection.objects.get(pk=message['text_section_pk'])
    old_html = fragment_fromstring(message['old_body'], create_parent='div')
    new_html = fragment_fromstring(text_section.body, create_parent='div')
    if htmldiff(old_html, new_html):
        logger.info(
            f'Found new body in text section pk={message["text_section_pk"]}'
        )
        text_section.update_definitions()
def note_diff(note1, note2):
    """Render two notes with the same template and return an inline html diff.

    Falls back to the rendered second note when htmldiff raises KeyError.
    """
    context = {
        'note': note1,
        'last_note': note2,
        'note_src': note2.uid,
        'http_root': settings.HTTP_ROOT
    }
    r1 = render_to_string('notesgroup/note_view.html', context)
    context['note'] = note2
    r2 = render_to_string('notesgroup/note_view.html', context)
    try:
        content = htmldiff(r1, r2)
    except KeyError:
        content = r2
    # Inline the .group style so the diff renders standalone (no stylesheet).
    content = content.replace('class="group"',
        'style="border:1px solid #CCCCCC; margin:1em 0 0; padding:0 1em;"')
    return content
def diff_with_previous(self, subject, page):
    """Show the diff between a page version (?version_id=N) and its predecessor."""
    c.breadcrumbs = [{'link': subject.url(), 'title': subject.title},
                     {'link': page.url(), 'title': page.title}]
    if page not in subject.pages:
        abort(404)
    version_id = int(request.GET['version_id'])
    c.version = PageVersion.get(version_id)
    # The predecessor is taken at idx+1 — assumes page.versions is ordered
    # newest-first; TODO confirm. Raises IndexError for the oldest version.
    idx = page.versions.index(c.version)
    c.prev_version = page.versions[idx+1]
    c.diff = literal(htmldiff(html_cleanup(c.prev_version.content),
                              html_cleanup(c.version.content)))
    return render('page/diff_with_previous.mako')
def highlight_html_differences(s1: str, s2: str, msg_id: Optional[int]=None) -> str:
    """Diff two html strings and restyle the <del>/<ins> markers as css spans.

    Returns the serialized tree (bytes, per lxml.html.tostring's default).
    `msg_id` is accepted for interface compatibility but unused here.
    """
    diffed = htmldiff(s1, s2)
    tree = lxml.html.fromstring(diffed)
    restyling = {'del': 'highlight_text_deleted', 'ins': 'highlight_text_inserted'}
    for tag, css_class in restyling.items():
        for node in tree.cssselect(tag):
            node.tag = 'span'
            node.set('class', css_class)
    return lxml.html.tostring(tree)
def difference2(current, old):
    """Return an html view of the step changes between two work-item revisions.

    Revision "1" has no predecessor, so its html is returned as-is. When the
    parsed xml of both revisions matches, a "no changes" notice is prepended
    to the current revision's parsed text. Otherwise the html diff is
    returned with <del>/<ins> wrapped in red/green font tags.
    """
    if current.revision == "1":
        return current.html
    if parse_xml(old.json_work_item_revision_text) == parse_xml(
            current.json_work_item_revision_text):
        return "<form id=err>No steps changes in the revision %s</form>" % current.revision + parse_xml(
            current.json_work_item_revision_text)
    colored = htmldiff(old.html, current.html)
    colored = colored.replace("<del>", "<del><font color=red>")
    colored = colored.replace("</del>", "</del></font>")
    colored = colored.replace("<ins>", "<ins><font color=green>")
    colored = colored.replace("</ins>", "</ins></font>")
    return colored
def highlight_html_differences(s1, s2, msg_id=None):
    # type: (str, str, Optional[int]) -> str
    """Diff two html strings, rewriting <del>/<ins> markers as styled spans."""
    markup = htmldiff(s1, s2)
    root = lxml.html.fromstring(markup)  # type: ignore # https://github.com/python/typeshed/issues/525
    for selector, klass in (('del', 'highlight_text_deleted'),
                            ('ins', 'highlight_text_inserted')):
        for node in root.cssselect(selector):
            node.tag = 'span'
            node.set('class', klass)
    return lxml.html.tostring(root)  # type: ignore # https://github.com/python/typeshed/issues/525
def highlight_html_differences(s1: str, s2: str, msg_id: Optional[int] = None) -> str:
    """Diff two html strings and convert the diff markers to css-classed spans.

    `msg_id` is accepted for interface compatibility but not used.
    """
    tree = lxml.html.fromstring(htmldiff(s1, s2))

    def _restyle(tag: str, css_class: str) -> None:
        # Turn every <tag> into <span class="css_class">.
        for node in tree.cssselect(tag):
            node.tag = 'span'
            node.set('class', css_class)

    _restyle('del', 'highlight_text_deleted')
    _restyle('ins', 'highlight_text_inserted')
    return lxml.html.tostring(tree)
def process_response(self,before_data,after_data, process_type = "injection", payloadpattern=""): diff_data = bs(str(htmldiff(after_data,before_data).split("<del>")[1:-1])) #pdb.set_trace() if process_type == "injection": if re.search(r'[sS][qQ][lL]',diff_data.text): try: print after_data.geturl() except: pass #pdb.set_trace() if payloadpattern != "": if re.search(payloadpattern,after_data): print "YESS YOU GOT IT" pdb.set_trace()
def highlight_html_differences(s1: str, s2: str, msg_id: Optional[int] = None) -> str:
    """Diff two html strings and restyle diff markers as css-classed spans.

    Returns a str (serialized with encoding="unicode"). `msg_id` is part
    of the interface but unused here.
    """
    document = lxml.html.fromstring(htmldiff(s1, s2))
    class_by_tag = {"del": "highlight_text_deleted", "ins": "highlight_text_inserted"}
    for tag, css_class in class_by_tag.items():
        for element in document.cssselect(tag):
            element.tag = "span"
            element.set("class", css_class)
    return lxml.html.tostring(document, encoding="unicode")
def test_new_render_view(self):
    """
    Test final html rendered by comparing with a reference file.
    Write the result into ./rendered/export.html

    NOTE(review): the unconditional `return` below deliberately disables
    the comparison (the author preferred manual verification), so the
    assertions after it are dead code.
    """
    ref_render_html = (Path(__file__).resolve().parent / 'new_ref_render.html').read_text()
    # this was a try to test against an another file. I won't say it was not working, but yeah, manual
    # verification was really more helpful
    assert True
    return
    self.longMessage = False
    self.assertEqual(
        ref_render_html,
        self.rendered_html,
        htmldiff(ref_render_html, self.rendered_html)
    )
    self.longMessage = True
def extract_features(source1, source2):
    """Extract normalized inserted-tag features from the diff of two gzipped pages.

    Each <ins> fragment containing markup is trimmed to its tag content,
    whitespace removed, and attribute values / inner text blanked so only
    the tag skeleton remains. Returns the fragments joined by spaces.
    NOTE(review): the trailing trim uses the FIRST '>' in the fragment
    (`ins[:ins.index('>')]`) and drops the '>' itself — confirm whether
    the last '>' / an inclusive slice was intended.
    """
    source1 = gzip.open(source1).read()
    source2 = gzip.open(source2).read()
    soup = BeautifulSoup(htmldiff(source1, source2), 'html.parser')
    inserts = []
    for ins in soup.find_all('ins'):
        ins = str(ins)[5:-6].strip()  # drop the surrounding "<ins>"/"</ins>"
        if '<' in ins and '>' in ins:
            if not ins.startswith('<'):
                ins = ins[ins.index('<'):]
            if not ins.endswith('>'):
                ins = ins[:ins.index('>')]
            ins = re.sub(r'\s+', '', ins)
            ins = re.sub(r'".*"', '""', ins)
            ins = re.sub(r'>.*<', '><', ins)
            inserts.append(ins)
    return ' '.join(inserts)
def answer_revisions(request, answer_id):
    """Revision history for an Answer."""
    answer = get_object_or_404(Answer, id=answer_id)
    revisions = list(answer.revisions.all())
    # Bulk-populate author fields to avoid a user query per revision.
    populate_foreign_key_caches(User, ((revisions, ('author',)),),
        fields=('username', 'gravatar', 'reputation', 'gold', 'silver', 'bronze'))
    for i, revision in enumerate(revisions):
        revision.html = QUESTION_REVISION_TEMPLATE % {
            'html': sanitize_html(markdowner.convert(revision.text)),
        }
        # Attach to each revision the diff against the revision that follows it.
        if i > 0:
            revisions[i - 1].diff = htmldiff(revision.html, revisions[i - 1].html)
    return render_to_response('answer_revisions.html', {
        'title': u'Answer Revisions',
        'answer': answer,
        'revisions': revisions,
    }, context_instance=RequestContext(request))
def highlight_html_differences(s1, s2, msg_id=None):
    # type: (str, str, Optional[int]) -> str
    """Diff two html strings and convert <del>/<ins> markers into styled spans."""
    html_diff = htmldiff(s1, s2)
    parsed = lxml.html.fromstring(
        html_diff)  # type: ignore # https://github.com/python/typeshed/issues/525
    for old_tag, new_class in [('del', 'highlight_text_deleted'),
                               ('ins', 'highlight_text_inserted')]:
        for el in parsed.cssselect(old_tag):
            el.tag = 'span'
            el.set('class', new_class)
    return lxml.html.tostring(
        parsed
    )  # type: ignore # https://github.com/python/typeshed/issues/525
def get_diff(self):
    """Get diff between the latest to website.check_files

    Diffs the css-selected fragment (or the whole <html>) of the last two
    snapshots. Returns (BeautifulSoup of the diff, raw diff string).
    NOTE(review): `soup.html()` CALLS the <html> tag (Tag.__call__ is
    find_all in BeautifulSoup), yielding a result list rather than the tag
    itself — `soup.html` was likely intended; confirm before changing.
    """
    hashes = []
    for _check_file in [self.check_files[-2], self.check_files[-1]]:
        soup = _check_file.soup
        if self.css_selector:
            cont = soup.select(self.css_selector)
            if len(cont) > 1:
                sys.exit('!! selector not unique')
            if not cont:
                sys.exit(f"!! selector '{self.css_selector}' no results")
            cont = cont[0]
        else:
            cont = soup.html()
        hashes.append(str(cont))
    diff = htmldiff(hashes[0], hashes[1])
    return BeautifulSoup(diff, 'lxml'), diff
def check_page(url):
    """Diff a page against its stored previous version and persist the new one.

    Returns the html diff string when the page changed, else None (also when
    retrieval failed or no previous version existed).
    """
    diff = None
    old_version = load_previous(url)
    new_version = load_current(url)
    if new_version is None:
        # BUG FIX: the original early return made the later
        # `elif new_version is None` branch unreachable, so this error
        # was never logged.
        logging.error('There was an error retrieving new version of url, see requests log: {}'.format(url))
        return None
    if old_version != new_version and not any([old_version is None, new_version is None]):
        diff = htmldiff(old_version, new_version)
        if new_version is not None:
            save_version(url, new_version)
    elif old_version is None and new_version is not None:
        save_version(url, new_version)
        logging.info('No previous version of page found for url: {}'.format(url))
    else:
        logging.info("No change in page found: {}".format(url))
    return diff
def answer_revisions(request, answer_id):
    """Revision history for an Answer."""
    answer = get_object_or_404(Answer, id=answer_id)
    revisions = list(answer.revisions.all())
    # Bulk-populate author fields to avoid a user query per revision.
    populate_foreign_key_caches(User, ((revisions, ('author', )), ),
        fields=('username', 'gravatar', 'reputation', 'gold', 'silver', 'bronze'))
    for i, revision in enumerate(revisions):
        revision.html = QUESTION_REVISION_TEMPLATE % {
            'html': sanitize_html(markdowner.convert(revision.text)),
        }
        # Attach to each revision the diff against the revision that follows it.
        if i > 0:
            revisions[i - 1].diff = htmldiff(revision.html, revisions[i - 1].html)
    return render_to_response('answer_revisions.html', {
        'title': u'Answer Revisions',
        'answer': answer,
        'revisions': revisions,
    }, context_instance=RequestContext(request))
def diffhtml(before_html, after_html, title=None):
    """Diffs the two files, and returns an html fragment that wraps
    differences in <ins> or <del> tags, which you can style as desired.

    Returns bytes, not str, because everything else works in bytes due
    to using requests.
    """
    if not title:
        title = "Changed Agenda"
    # lxml.html.htmldiff only accepts strings, not bytes, but these
    # were read in as bytes because that's what comes from requests;
    # so translate them.
    if type(before_html) is bytes:
        before_html = before_html.decode()
    if type(after_html) is bytes:
        after_html = after_html.decode()
    # lxml.html.htmldiff returns fragments, not full documents.
    # So add a header that includes a style for ins and del.
    diff = '''<!DOCTYPE HTML PUBLIC "-//W3C//DTD HTML 4.01 Transitional//EN">
<html>
<head>
<meta http-equiv="content-type" content="text/html; charset=utf-8" />
<meta name="viewport" content="width=device-width, initial-scale=1">
<title>%s</title>
<style>
ins { background: #9ff; }
del { background: #fbb; }
</style>
</head>
<body>
<h1>%s</h1>
''' % (title, title)
    diff += htmldiff(before_html, after_html)
    diff += "\n</body></html>\n"
    # encode to return bytes.
    return diff.encode()
def question_revisions(request, question_id):
    """Revision history for a Question."""
    question = get_object_or_404(Question, id=question_id)
    revisions = list(question.revisions.all())
    # Bulk-populate author fields to avoid a user query per revision.
    populate_foreign_key_caches(User, ((revisions, ('author',)),),
        fields=('username', 'gravatar', 'reputation', 'gold', 'silver', 'bronze'))
    for i, revision in enumerate(revisions):
        revision.html = QUESTION_REVISION_TEMPLATE % {
            'title': revision.title,
            'html': sanitize_html(markdowner.convert(revision.text)),
            'tags': ' '.join(['<a class="tag">%s</a>' % tag
                              for tag in revision.tagnames.split(' ')]),
        }
        # Attach to each revision the diff against the revision that follows it.
        if i > 0:
            revisions[i - 1].diff = htmldiff(revision.html, revisions[i - 1].html)
    return render_to_response('question_revisions.html', {
        'title': u'Question Revisions',
        'question': question,
        'revisions': revisions,
    }, context_instance=RequestContext(request))
def diff_texts(text1, text2):
    """Return an html diff of two texts.

    Optionally strips html tags (REVERSION2_DIFF_TEXT_ONLY) and collapses
    duplicate newlines (REVERSION2_IGNORE_WHITESPACE) before diffing, which
    helps ignore diff-plugin ids and whitespace noise.
    """
    from lxml.html.diff import htmldiff

    if REVERSION2_DIFF_TEXT_ONLY:
        text1 = BeautifulSoup(text1, features="lxml").get_text()
        text2 = BeautifulSoup(text2, features="lxml").get_text()
    if REVERSION2_IGNORE_WHITESPACE:
        text1 = re.sub(r'\n+', '\n', text1).strip()
        text2 = re.sub(r'\n+', '\n', text2).strip()
    return htmldiff(text1, text2)
def _generate_diffs(self):
    """Build per-page html diff files between the head and base site builds.

    For every tracked file, renders a "diff-<name>" page into the head build
    directory, combining head's <head> (plus an injected stylesheet) with the
    htmldiff of the two <body> elements. Files present on only one side
    (additions/deletions) are skipped.
    """
    self._updater.update(
        status='pending',
        description="Creating head and base website diffs.")
    for f in self._files:
        # Non-html outputs map to their generated .html counterpart.
        froot, fext = os.path.splitext(f)
        if fext not in HTML_EXTS:
            f = froot + '.html'
        f = os.path.join("_site", f)
        fpath, fname = os.path.split(f)
        head = os.path.join(self._head_dir, f)
        base = os.path.join(self._base_dir, f)
        diff = os.path.join(self._head_dir, fpath, "diff-" + fname)
        # if addition or deletion, just skip
        if not os.path.isfile(head) or not os.path.isfile(base):
            continue
        with open(base, 'r') as f:
            doc1 = lxml.html.parse(f)
        with open(head, 'r') as f:
            doc2 = lxml.html.parse(f)
        doc1body = doc1.find('body')
        doc2body = doc2.find('body')
        bodydiff = htmldiff(
            lxml.html.tostring(doc1body, encoding='utf-8').decode('utf-8'),
            lxml.html.tostring(doc2body, encoding='utf-8').decode('utf-8'))
        doc2head = doc2.find('head')
        add_stylesheet(doc2head)
        diffdoc = u'<html>\n{0}\n<body>\n{1}\n</body>\n</html>'
        diffdoc = diffdoc.format(
            lxml.html.tostring(doc2head, encoding='utf-8').decode('utf-8'),
            bodydiff)
        with io.open(diff, 'wb') as f:
            f.write(diffdoc.encode('utf-8'))
        print("diff'd {0!r}".format(diff))
def diff_rss(url, name, limit=-1):
    """Collect off-site links added between consecutive posts of an RSS feed.

    Python 2 code. For each entry, diffs the previous post's body against the
    current one and records hrefs found inside <ins> elements that point to a
    different domain. Returns {entry_link: [added hrefs]}.
    NOTE(review): parameter `name` is unused.
    """
    rss = feedparser.parse(url)
    links = {}
    #print rss
    if limit == -1 or limit > len(rss.entries):
        limit = len(rss.entries)
    first_index = get_first_index(rss)
    for i in range(first_index, limit+1):
        links[rss.entries[i].link] = []
        post1 = rss.entries[i-1].link
        # At the last index, compare against the entry before first_index
        # — TODO confirm this wrap-around is the intended pairing.
        if i == limit:
            post2 = rss.entries[first_index-1].link
        else:
            post2 = rss.entries[i].link
        print post2
        diffh = htmldiff(get_content(post1)["body"], get_content(post2)["body"])
        tree = etree.parse(StringIO.StringIO(diffh), parser)
        diff = tree.xpath("//ins//@href")
        for d in diff:
            # Keep only links to other domains with a non-root path.
            if urlparse(d).netloc != urlparse(rss.feed.link).netloc and urlparse(d).path != '/':
                links[rss.entries[i].link].append(d)
    return links
def difference2(test_case_id, test_case_rev):
    """Return an html view of the step changes between a test case revision
    and its predecessor.

    Revision "1" has no predecessor, so its parsed xml is returned as-is.
    When both revisions parse identically, a "no changes" notice is
    prepended; otherwise the html diff is returned with <del>/<ins>
    wrapped in red/green font tags.
    """
    if test_case_rev == "1":
        return parse_xml(test_case_id, test_case_rev)
    # Hoist the two parse_xml calls: the original parsed the previous
    # revision twice and the current one up to twice. Assumes parse_xml
    # is side-effect free — it is only used for its return value here.
    prev_rev = str(int(test_case_rev) - 1)
    old = parse_xml(test_case_id, prev_rev)
    new = parse_xml(test_case_id, test_case_rev)
    if old == new:
        return "<form id=err>No steps changes in the revision %s</form>" % test_case_rev + new
    diff_html = htmldiff(old, new)
    diff_html = diff_html.replace("<del>", "<del><font color=red>")
    diff_html = diff_html.replace("</del>", "</del></font>")
    diff_html = diff_html.replace("<ins>", "<ins><font color=green>")
    diff_html = diff_html.replace("</ins>", "</ins></font>")
    return diff_html
# print(parse_xml("409770","11"))
# print(get_t_c_rev("409770","32"))
# print(parse_html("446114 ", "18"))
def smart_read(url): resp = urllib2.urlopen(url) #resolve url url = resp.url domain = urlparse(url).netloc path = urlparse(url).path html = resp.read() tree = etree.parse(StringIO.StringIO(html), parser) links = tree.xpath("//body//@href") nmax = 0 for link in links: if urlparse(link).netloc == domain: ng = NGram.compare(urlparse(link).path,path) #print link,ng if ng > nmax and ng < 1: nmax = ng mirror = link diffh = htmldiff(visit_page(url)["body"], visit_page(mirror)["body"]) tree = etree.parse(StringIO.StringIO(diffh), parser) diff = tree.xpath("//ins//text()") for d in diff: print d
def getHtmlDiff(dom1, dom2, tagCount1, tagCount2): diff1 = htmldiff(dom1, dom2) #diff2 = htmldiff(dom2 ,dom1) #print diff1 ''' if len(diff1) > len(diff2): diff = diff1 tagCount = tagCount1 else: diff = diff2 tagCount = tagCount2 ''' diff = diff1 tagCount = tagCount1 #diff = diff1 bdiff = BeautifulSoup(diff) ins = ''.join(str(bdiff.findAll("ins"))) print ins delete = ''.join(str(bdiff.findAll("del"))) print cleanDom(delete) diffDom = cleanDom(ins) print diffDom return diffDom
#!/usr/bin/env python from sys import argv from lxml.html.diff import htmldiff def help(): print "----------------------------------" print " An diff tool!" print "----------------------------------" print print "Usage: <file_a> <file_b>" print "Output: HTML" print print "Example output:" print " <ins>hello</ins><del>goodbye cruel</del> world" print if __name__ == "__main__": if len(argv) != 3: help() exit(1) else: file_a, file_b = argv[1:] a = open(file_a).read() b = open(file_b).read() print htmldiff(a, b)
def htmldiffer(ver_1, ver_2):
    """Return html diffs (ver_2 -> ver_1) for a version's content, license and title."""
    return {
        'content': htmldiff(ver_2.content, ver_1.content),
        'license': htmldiff(ver_2.license, ver_1.license),
        'title': htmldiff(ver_2.title, ver_1.title),
    }
def _build_email_body(self, mako_template_filepath: str, role: UserRoleInWorkspace, content: Content, actor: User) -> str:
    """
    Build an email body and return it as a string

    :param mako_template_filepath: the absolute path to the mako template
        to be used for email body building
    :param role: the role related to user to whom the email must be sent.
        The role is required (and not the user only) in order to show in
        the mail why the user receives the notification
    :param content: the content item related to the notification
    :param actor: the user at the origin of the action / notification
        (for example the one who wrote a comment)
    :return: the built email body as string. In case of multipart email,
        this method must be called one time for text and one time for html
    """
    logger.debug(self, 'Building email content from MAKO template {}'.format(mako_template_filepath))
    template = Template(filename=mako_template_filepath)
    # TODO - D.A. - 2014-11-06 - move this
    # Import is here for circular import problem
    import tracim.lib.helpers as helpers
    dictified_item = Context(CTX.EMAIL_NOTIFICATION, self._global_config.WEBSITE_BASE_URL).toDict(content)
    dictified_actor = Context(CTX.DEFAULT).toDict(actor)
    main_title = dictified_item.label
    content_intro = ''
    content_text = ''
    call_to_action_text = ''
    # Choose intro/body/call-to-action depending on the last action and
    # the content type. Revisions of pages/threads get an html diff of
    # label and description against the previous revision.
    action = content.get_last_action().id
    if ActionDescription.COMMENT == action:
        content_intro = _('<span id="content-intro-username">{}</span> added a comment:').format(actor.display_name)
        content_text = content.description
        call_to_action_text = _('Answer')
    elif ActionDescription.CREATION == action:
        # Default values (if not overriden)
        content_text = content.description
        call_to_action_text = _('View online')
        if ContentType.Thread == content.type:
            call_to_action_text = _('Answer')
            content_intro = _('<span id="content-intro-username">{}</span> started a thread entitled:').format(actor.display_name)
            content_text = '<p id="content-body-intro">{}</p>'.format(content.label) + \
                content.get_last_comment_from(actor).description
        elif ContentType.File == content.type:
            content_intro = _('<span id="content-intro-username">{}</span> added a file entitled:').format(actor.display_name)
            if content.description:
                content_text = content.description
            else:
                content_text = '<span id="content-body-only-title">{}</span>'.format(content.label)
        elif ContentType.Page == content.type:
            content_intro = _('<span id="content-intro-username">{}</span> added a page entitled:').format(actor.display_name)
            content_text = '<span id="content-body-only-title">{}</span>'.format(content.label)
    elif ActionDescription.REVISION == action:
        content_text = content.description
        call_to_action_text = _('View online')
        if ContentType.File == content.type:
            content_intro = _('<span id="content-intro-username">{}</span> uploaded a new revision.').format(actor.display_name)
            content_text = ''
        elif ContentType.Page == content.type:
            content_intro = _('<span id="content-intro-username">{}</span> updated this page.').format(actor.display_name)
            previous_revision = content.get_previous_revision()
            title_diff = ''
            if previous_revision.label != content.label:
                title_diff = htmldiff(previous_revision.label, content.label)
            content_text = _('<p id="content-body-intro">Here is an overview of the changes:</p>') + \
                title_diff + \
                htmldiff(previous_revision.description, content.description)
        elif ContentType.Thread == content.type:
            content_intro = _('<span id="content-intro-username">{}</span> updated the thread description.').format(actor.display_name)
            previous_revision = content.get_previous_revision()
            title_diff = ''
            if previous_revision.label != content.label:
                title_diff = htmldiff(previous_revision.label, content.label)
            content_text = _('<p id="content-body-intro">Here is an overview of the changes:</p>') + \
                title_diff + \
                htmldiff(previous_revision.description, content.description)
        # elif ContentType.Thread == content.type:
        #     content_intro = _('<span id="content-intro-username">{}</span> updated this page.').format(actor.display_name)
        #     previous_revision = content.get_previous_revision()
        #     content_text = _('<p id="content-body-intro">Here is an overview of the changes:</p>') + \
        #         htmldiff(previous_revision.description, content.description)
    elif ActionDescription.EDITION == action:
        call_to_action_text = _('View online')
        if ContentType.File == content.type:
            content_intro = _('<span id="content-intro-username">{}</span> updated the file description.').format(actor.display_name)
            content_text = '<p id="content-body-intro">{}</p>'.format(content.get_label()) + \
                content.description
    if '' == content_intro and content_text == '':
        # Skip notification, but it's not normal
        logger.error(
            self,
            'A notification is being sent but no content. '
            'Here are some debug informations: [content_id: {cid}]'
            '[action: {act}][author: {actor}]'.format(
                cid=content.content_id, act=action, actor=actor
            )
        )
        raise ValueError('Unexpected empty notification')
    # Import done here because cyclic import
    from tracim.config.app_cfg import CFG
    body_content = template.render(
        base_url=self._global_config.WEBSITE_BASE_URL,
        _=_,
        h=helpers,
        user_display_name=role.user.display_name,
        user_role_label=role.role_as_label(),
        workspace_label=role.workspace.label,
        content_intro=content_intro,
        content_text=content_text,
        main_title=main_title,
        call_to_action_text=call_to_action_text,
        result = DictLikeClass(item=dictified_item, actor=dictified_actor),
        CFG=CFG.get_instance(),
    )
    return body_content
def do_stuff(i=0):
    """Diff successive gzipped snapshots of every crawled page in one chunk of content/.

    The glob of domain directories is split into 6 equal chunks so six worker
    processes can run in parallel; *i* (0-5) selects this worker's chunk.  For
    every unique URL path, the snapshots are sorted by their timestamp suffix
    and each consecutive pair is diffed with htmldiff; results are written
    under diffs/ mirroring the source layout.  Already-existing diff files are
    skipped so an interrupted run can be resumed.
    """
    domains = glob.glob('content/*')
    chunk_size = len(domains) / 6
    # Keep only the 1/6th of the domain list assigned to process *i*.
    domains = domains[int(i * chunk_size):int((i + 1) * chunk_size)]
    logging.info("Starting chunk " + str(int(i * chunk_size)))
    for domain in domains:
        # Strip the trailing timestamp component to collect the unique URL paths.
        uniq_urls = set()
        for filename in glob.glob(domain + '/*'):
            uniq_urls.add('_'.join(filename.split('_')[:-1]))
        for path in uniq_urls:
            # Collect the snapshot timestamps for this path; filenames whose
            # suffix is not a parsable float are silently ignored.
            timestamps = set()
            for filename in glob.glob(path + '*'):
                try:
                    timestamps.add(float('.'.join(filename.split('_')[-1].split('.')[:-1])))
                except Exception:
                    pass
            if not timestamps:
                # Fix: previously timestamps[0] raised an uncaught IndexError
                # when no filename parsed; nothing to diff, move on.
                continue
            try:
                # Diffs must be generated oldest-to-newest.
                ordered = sorted(timestamps)
                previous_file = path + '_' + str(ordered[0]) + '.gz'
                # Iterate the remaining copies in chronological order.
                # (Renamed from 'i', which shadowed the chunk parameter.)
                for ts in ordered[1:]:
                    current_file = path + '_' + str(ts) + '.gz'
                    out_name = "diffs/" + path + '_' + str(ts) + '_diff.gz'
                    # Make sure the output folder exists.
                    # (Renamed from 'dir', which shadowed the builtin.)
                    out_dir = os.path.dirname(out_name)
                    if not os.path.exists(out_dir):
                        os.makedirs(out_dir)
                    # Skip pairs that have already been diffed (resumability).
                    if not os.path.isfile(out_name):
                        with open(out_name, 'w') as f:
                            try:
                                try:
                                    # All of the diffing and writing happens here,
                                    # on the raw bytes of the two snapshots.
                                    f.write(htmldiff(gzip.open(previous_file).read(),
                                                     gzip.open(current_file).read()))
                                    logging.debug("successful write " + out_name)
                                except Exception:
                                    # Fallback: decode as UTF-8 (ignoring bad
                                    # bytes) before diffing, then re-encode.
                                    f.write(unicode(htmldiff(
                                        gzip.open(previous_file).read().decode('utf-8', 'ignore'),
                                        gzip.open(current_file).read().decode('utf-8', 'ignore'),
                                    )).encode('utf-8', 'ignore'))
                                    logging.debug("transcode write " + out_name)
                            except IOError:
                                logging.debug("missing file " + out_name)
                            except AssertionError as e:
                                # Fix: concatenating the exception object itself
                                # raised TypeError and lost the log line; use str(e).
                                logging.debug("Assertion Error " + out_name + " : " + str(e))
                    else:
                        logging.debug("already exists " + out_name)
                    previous_file = current_file
            except TypeError as e:
                print(e)
def show_diff(old_diff, new_diff):
    """Render the markup difference between two HTML fragments.

    Returns the new fragment annotated with <ins>/<del> tags marking
    what changed relative to the old one.
    """
    diff_markup = htmldiff(old_diff, new_diff)
    return diff_markup
def value(self):
    """Return the HTML diff of the two stored values (value1 vs value2)."""
    before = self.value1
    after = self.value2
    return htmldiff(before, after)
def getDomDiff(parentDom, childDom):
    """Diff two DOM strings and return the cleaned DOM of the inserted content.

    Runs htmldiff on the parent and child markup, extracts every <ins>
    element from the annotated result, and passes their combined string
    representation through cleanDom.
    """
    annotated = htmldiff(parentDom, childDom)
    soup = BeautifulSoup(annotated)
    # str() of the findAll result list; the original wrapped this in a
    # ''.join(...) which is an identity operation on a string.
    inserted = str(soup.findAll("ins"))
    return cleanDom(inserted)
new_contents = site.get_page(subpath, rev=new) except sven.ResourceUnchanged, e: new = e.last_change resource_unchanged = True except sven.NotAFile: return redirect(site.directory_index_url(subpath)) except sven.NoSuchResource: return redirect(site.history_url(subpath)) except sven.FutureRevision: return redirect(site.history_url(subpath)) if resource_unchanged: return redirect(site.page_diff_url(subpath) + "?versions=%s,%s" % (old, new)) # @@todo: raw diff? binary diff? contents = htmldiff(old_contents, new_contents) mimetype = mimetypes.guess_type(subpath)[0] return dict(site=site, contents=contents, mimetype=mimetype, path=subpath) @requires("WIKI_EDIT") @allow_http("GET", "POST") @rendered_with("sites/site/page-create.html") def page_create(request, subpath): site = request.site if request.method == "POST": path = request.POST['path'] # @@todo: don't slugify for raw wikis? dunno from django.template.defaultfilters import slugify path = '/'.join(slugify(i) for i in path.split('/'))
def _diff_html(left, right):
    """Thin wrapper: compute the HTML diff of *right* against *left*."""
    result = htmldiff(left, right)
    return result