def generate_content_test(filename):
    """Run metahtml content extraction on a saved page and write results next to it.

    Writes `<filename>.metahtml.html` / `.metahtml.text` with the extracted
    content, then an html diff (`<filename>.diff.html`) against the saved
    newspaper3k extraction for visual comparison.
    """
    filename_output_html = filename + '.metahtml.html'
    filename_output_text = filename + '.metahtml.text'
    #if os.path.isfile(filename_output_text):
    #    return
    # The containing directory name encodes the source URL
    # (e.g. "https___example_com_page" -> "https://example/com/page").
    urlish = os.path.dirname(filename).split('/')[-1]
    urlish = urlish.replace('___', '://').replace('_', '/')
    with open(filename) as f:
        html = f.read()
    meta = metahtml.parse_all(html, urlish, fast=False)
    with open(filename_output_html, 'w') as fout:
        html_metahtml = meta['content']['value']['html']
        fout.write(html_metahtml)
    with open(filename_output_text, 'w') as fout:
        fout.write(meta['content']['value']['text'])
    # generate the diff
    with open(filename + '.newspaper3k.html') as f:
        html_newspaper = f.read()
    with open(filename + '.diff.html', 'w') as f:
        html_diff = '<style> ins { background-color: #00ff00; }\ndel {background-color: #ff0000} </style>'
        html_diff += htmldiff(html_newspaper, html_metahtml)
        f.write(html_diff)
def test_basic(self):
    """Multi-page article should merge all pages and match the expected regression html."""
    html = load_regression_data('basic-multi-page.html')
    urldict = self._make_basic_urldict()
    fetcher = urlfetch.MockUrlFetch(urldict)
    options = {
        'url': 'http://basic.com/article.html',
        'multipage': True,
        'urlfetch': fetcher
    }
    doc = Document(html, **options)
    res = doc.summary_with_metadata()
    self.assertIn('Page 2', res.html, 'Should find the page 2 heading')
    self.assertIn('Page 3', res.html, 'Should find the page 3 heading')
    expected_html = load_regression_data('basic-multi-page-expected.html')
    # htmldiff marks differences with <ins>/<del>; a clean match has neither.
    diff_html = htmldiff(expected_html, res.html)
    diff_doc = document_fromstring(diff_html)
    insertions = diff_doc.xpath('//ins')
    deletions = diff_doc.xpath('//del')
    if len(insertions) != 0:
        for i in insertions:
            print('unexpected insertion: %s' % i.xpath('string()'))
        self.fail('readability result does not match expected')
    if len(deletions) != 0:
        for i in deletions:
            print('unexpected deletion: %s' % i.xpath('string()'))
        self.fail('readability result does not match expected')
def main(args=None):
    """CLI entry point (Python 2): diff the <body> of two html files.

    With --annotation, delegates to annotate(); otherwise expects exactly
    two file arguments and writes the merged diff to --output ('-' = stdout).
    """
    if args is None:
        args = sys.argv[1:]
    options, args = parser.parse_args(args)
    if options.annotation:
        return annotate(options, args)
    if len(args) != 2:
        print 'Error: you must give two files'
        parser.print_help()
        sys.exit(1)
    file1, file2 = args
    input1 = read_file(file1)
    input2 = read_file(file2)
    # Diff only the bodies; re-wrap the result with file2's header/footer.
    body1 = split_body(input1)[1]
    pre, body2, post = split_body(input2)
    result = htmldiff(body1, body2)
    result = pre + result + post
    if options.output == '-':
        if not result.endswith('\n'):
            result += '\n'
        sys.stdout.write(result)
    else:
        f = open(options.output, 'wb')
        f.write(result)
        f.close()
def detect_change(file_name: str, new_content: str) -> Optional[str]:
    """Detect if the content in a file differs from a given string.

    The file is (re)written with *new_content* whenever a difference is
    detected or the file does not exist yet.

    Args:
        file_name: The file that contains the string for comparison.
        new_content: The string to compare with the file's contents.

    Returns:
        The html diff between old and new content if the file changed,
        *new_content* itself if the file did not exist, else None.
    """
    if not os.path.isfile(file_name):
        # First sighting of this file: everything counts as new content.
        with open(file_name, "w") as handle:
            handle.write(new_content)
        return new_content
    with open(file_name, "r") as handle:
        old_content = handle.read()
    if old_content == new_content:
        return None
    result = htmldiff(old_content, new_content)
    with open(file_name, "w") as handle:
        handle.write(new_content)
    return result
def post(self):
    """Render an html diff of two script versions (Python 2 App Engine handler).

    Expects v_o_id/v_t_id (script ids) and v_o/v_t (version numbers) as
    request parameters; returns silently when permission is denied.
    """
    v_o_id = self.request.get('v_o_id')
    v_t_id = self.request.get('v_t_id')
    title = permission(v_o_id)
    p = permission(v_t_id)
    if title == False or p == False:
        return
    version_one = self.request.get('v_o')
    version_two = self.request.get('v_t')
    r_one = models.ScriptData.get_version(v_o_id, version_one)
    r_two = models.ScriptData.get_version(v_t_id, version_two)
    # css class name per serialized line-format code
    v = ['s','a','c','d','p','t']
    def to_html(raw_data):
        # Convert json-serialized [(text, format), ...] pairs into <p> markup.
        j = simplejson.loads(raw_data)
        s = StringIO.StringIO()
        for text, line_format in j:
            text = cgi.escape(text, quote=True)
            s.write("<p class='"+v[line_format]+"'>"+text+"</p>")
        return s.getvalue()
    s_one = to_html(r_one.data)
    s_two = to_html(r_two.data)
    content = htmldiff(s_one, s_two)
    self.response.headers['Content-Type'] = 'text/html'
    self.response.out.write(content)
def get_context_data(self, **kwargs):
    """Add per-placeholder html diffs between the public and draft page renders."""
    context = super(DiffView, self).get_context_data(**kwargs)
    page = get_object_or_404(Page, pk=self.pk)
    public_page = self.render_page_placeholders(page.get_public_object(), self.request)
    draft_page = self.render_page_placeholders(page.get_draft_object(), self.request)
    diffs = []
    for slot, public_rendered in public_page.items():
        draft_rendered = draft_page.pop(slot, [])
        diff = htmldiff(public_rendered, draft_rendered)
        tree = parse_html(diff, cleanup=False)
        # Drop childless <ins>/<del> markers that contain only whitespace.
        for item in tree.xpath("//ins | //del"):
            if len(item):
                continue
            content = item.text
            if not (content and content.strip()):
                item.getparent().remove(item)
        diffs.append(etree.tostring(tree, method='html'))
    context.update({
        'title': _('Show current changes'),
        'diffs': diffs,
    })
    return context
def show_entry_history(request, entry_id):
    """
    Display a page with two version of the entries, compared with each other,
    with highlighted changes.
    """
    if not is_contributor(request):
        messages.warning(request, _('This page is for contributors only.'))
        return redirect('index')
    entry = get_object_or_404(Entry, pk=entry_id)
    version_1 = request.GET.get('version_1', False)
    version_2 = request.GET.get('version_2', False)
    # "Newer" side: explicit ?version_2= or the latest version.
    if version_2:
        newer = get_object_or_404(EntryVersion, pk=version_2, entry_id=entry_id)
    else:
        newer = entry.versions.last()
    # "Older" side: explicit ?version_1=, else the last approved version
    # that isn't `newer`, falling back to any version that isn't `newer`.
    if version_1:
        older = get_object_or_404(EntryVersion, pk=version_1, entry_id=entry_id)
    else:
        if entry.versions.count() > 1:
            older = entry.versions.filter(is_approved=True).exclude(pk=newer.pk).last()
            if older is None:
                older = entry.versions.exclude(pk=newer.pk).last()
        else:
            # NOTE(review): with a single version this makes older == newer,
            # producing an empty diff — presumably intentional; confirm.
            older = entry.versions.last()
    if older is None or newer is None:
        raise Http404
    # Keep the chronological order regardless of which side was requested.
    if newer.date < older.date:
        newer, older = older, newer
    def make_html(version):
        # Flatten a version's lines (speaker + text) plus note into one article.
        html = ['<article class="entry-article w3-display-container w3-border w3-card">']
        for line in version.lines.all():
            html.append("<h3>{}</h3>".format(line.speaker))
            html.append(line.text)
        if version.note:
            html.append('<small class="footnote">Footnote: {}</small>'.format(version.note))
        html.append("</article>")
        return "".join(html)
    newer_html = make_html(newer)
    older_html = make_html(older)
    html_diff = htmldiff(older_html, newer_html)
    return render(request, "palanaeum/staff/entry_history.html", {
        'newer_version': newer,
        'newer_html': newer_html,
        'html_diff': html_diff,
        'older_version': older,
        'older_html': older_html,
        'entry': entry,
        'all_versions': EntryVersion.objects.filter(entry=entry),
        'snippets': Snippet.all_visible.filter(entry=entry),
        'images': ImageSource.all_visible.filter(entry=entry)
    })
def question_revisions(request, question_id):
    """Revision history for a Question."""
    question = get_object_or_404(Question, id=question_id)
    revisions = list(question.revisions.all())
    # Bulk-populate author fields to avoid a user query per revision.
    populate_foreign_key_caches(User, ((revisions, ('author', )), ),
        fields=('username', 'gravatar', 'reputation', 'gold', 'silver', 'bronze'))
    for i, revision in enumerate(revisions):
        revision.html = QUESTION_REVISION_TEMPLATE % {
            'title': revision.title,
            'html': sanitize_html(markdowner.convert(revision.text)),
            'tags': ' '.join([
                '<a class="tag">%s</a>' % tag for tag in revision.tagnames.split(' ')
            ]),
        }
        # Attach to each revision the diff against the revision that follows it.
        if i > 0:
            revisions[i - 1].diff = htmldiff(revision.html, revisions[i - 1].html)
    return render_to_response('question_revisions.html', {
        'title': u'Question Revisions',
        'question': question,
        'revisions': revisions,
    }, context_instance=RequestContext(request))
def diff_html(original_content, input_file):
    """
    Generate a html diff between two html files by adding `<ins>` and
    `<del>` tags. (Python 2 code: uses `unicode` and encodes to utf-8.)

    "Link: ..." additions and whitespace-only <ins>/<del> tags are
    stripped from the diff before the prettified body is returned.
    """
    with codecs.open(input_file, 'r', 'utf-8') as right:
        right_content = right.read()
    content = htmldiff(
        original_content,
        right_content).encode('utf-8')
    soup = BeautifulSoup(content, 'lxml')
    # Remove link: additions
    for a in soup.findAll(['a']):
        if a.text and re.search(r'\bLink:\s.+$', a.text.encode('utf-8'),
                                re.MULTILINE | re.UNICODE):
            # BUG FIX: the original passed the flags positionally, where
            # re.sub expects `count` — so MULTILINE/UNICODE never applied.
            a.string = re.sub(
                r'\bLink:\s.+$', u'', a.text,
                flags=re.MULTILINE | re.UNICODE)
    # Remove empty tags
    for ins in soup.findAll(['ins', 'del']):
        if re.match(r'^\s*$', ins.text):
            ins.extract()
    result = []
    for element in soup.body.contents:
        if hasattr(element, 'prettify'):
            result.append(element.prettify())
        elif element and unicode(element) and not re.match(r'^\s*$', unicode(element)):
            result.append(unicode(element))
    return ''.join(result).encode('utf-8')
def render_details(self) -> str:
    """Render a cleaned html diff of the old vs. new template value, marked safe."""
    old_value = self.template_vars["old_value"]
    new_value = self.template_vars["new_value"]
    return Markup(clean_html(htmldiff(old_value, new_value)))
def text_diff(self):
    """Return an html diff of the two tree nodes' text contents, or None.

    Uses the first tree2 diff flagged as a texts diff; returns None when
    no such diff exists.
    """
    # first() returns None on an empty queryset, so the original
    # exists() + first() pair (two queries) collapses to one query.
    diff = self.tree2_diffs.filter(is_texts_diff=True).first()
    if diff is None:
        return None
    return htmldiff(
        diff.treenode1.element.text.texts_html(),
        diff.treenode2.element.text.texts_html()
    )
def attrs_diff(self):
    """Return an html diff of the two tree nodes' attributes, or None.

    Uses the first tree2 diff flagged as an attrs diff; returns None when
    no such diff exists.
    """
    # first() returns None on an empty queryset, so the original
    # exists() + first() pair (two queries) collapses to one query.
    diff = self.tree2_diffs.filter(is_attrs_diff=True).first()
    if diff is None:
        return None
    return htmldiff(
        diff.treenode1.element.attributes_html(),
        diff.treenode2.element.attributes_html()
    )
def get_diff_of_pages(self, page1, page2):
    """Return a colorized html diff of two utf-8 encoded page bodies (Python 2).

    NOTE(review): the text is unescaped twice — once before the style
    replacements and again on the result — presumably to undo double
    html-escaping in the diff output; confirm this is intentional.
    Returns None (after logging) when diffing fails.
    """
    try:
        diff = HTMLParser.HTMLParser().unescape(
            htmldiff(page1.decode("utf-8"), page2.decode("utf-8")).encode("utf-8")
        )
        diff = diff.replace("<del>", '<del style="color:red">')
        diff = diff.replace("<ins>", '<ins style="color:green">')
        return HTMLParser.HTMLParser().unescape(diff)
    except Exception as e:
        self.log("get_diff_of_pages: HTML Error " + str(e))
def compare_html(actual, expected):
    """Assert that the html diff of *actual* vs *expected* equals *actual*.

    Prints and reports the index of the first differing character in the
    assertion message. Returns True when the diff matches *actual*.
    """
    _actual = removed_spaces(actual)
    _expected = removed_spaces(expected)
    diff = html_unquote(htmldiff(_actual, _expected))
    # BUG FIX: initialize i — with empty inputs the loop never runs, and
    # evaluating the assertion message (_actual[:i]) raised NameError.
    i = 0
    for i, (a, e) in enumerate(zip(_actual, _expected)):
        if a != e:
            print(i)
            break
    assert diff == _actual, _actual[:i]
    return diff == _actual
def get_diff_of_pages(self, page1, page2):
    """Return a colorized html diff of two utf-8 encoded page bodies (Python 2).

    NOTE(review): the text is unescaped twice (before the replacements
    and again on the result) — presumably to undo double html-escaping;
    confirm. Returns None (after logging) when diffing fails.
    """
    try:
        diff = HTMLParser.HTMLParser().unescape(
            htmldiff(page1.decode('utf-8'), page2.decode('utf-8')).encode('utf-8'))
        diff = diff.replace('<del>', '<del style="color:red">')
        diff = diff.replace('<ins>', '<ins style="color:green">')
        return HTMLParser.HTMLParser().unescape(diff)
    except Exception as e:
        self.log('get_diff_of_pages: HTML Error ' + str(e))
def unified_diff(content1, content2):
    """Return an html diff (string) of two chapter bodies after cleaning them.

    NOTE: on a cleaning failure this returns the dict {"result": False}
    instead of a string — callers must handle both return shapes.
    """
    try:
        content1 = clean_chapter_html(content1, clean_comments_trail=True)
        content2 = clean_chapter_html(content2, clean_comments_trail=True)
    except Exception as e:
        logger.error('ERROR while cleaning content %s. Rev 1: %s Chapter: %s' % (
            e, content1, content2))
        return {"result": False}
    diff = htmldiff(content1, content2)
    return diff
def _diff_elements(old, new):
    """Diff the contents of two Beautiful Soup elements.

    Note that this returns the "new" element with its content replaced
    by the diff; empty string when either element is missing/falsy.
    """
    if not old:
        return ''
    if not new:
        return ''
    container = copy.copy(new)
    container.clear()
    container.append(htmldiff(str(old), str(new)))
    return container
def visual_diff(revision):
    """Render an html diff between a git revision and its first parent.

    NOTE(review): `parent_tree` duplicates `parent.tree`; and when the
    commit produces exactly two patches the target html is diffed against
    itself (an effectively empty diff) — presumably a rename sentinel;
    confirm intent before changing.
    """
    target = repo[revision]
    parent = target.parents[0]
    tree = target.tree
    parent_tree = target.parents[0].tree
    diff = parent_tree.diff(tree)
    patches = list(diff)
    filename = get_current_name(tree, patches)
    name = filename[:-4]  # strips a 4-character suffix — presumably a file extension; confirm
    target_html = get_html_revision(name, revision, False)
    if len(patches) == 2:
        return {"patch": htmldiff(target_html, target_html)}
    parent_html = (get_html_revision(name, parent.hex, False)
                   if filename in parent_tree else "")
    return {"patch": htmldiff(parent_html, target_html)}
def compare_pages(url1, url2, selector='body div'):
    """Fetch two pages, diff the first element matching *selector*, and
    return the first page's document with that element replaced by the diff.
    """
    base_doc = parse(url1).getroot()
    base_doc.make_links_absolute()
    other_doc = parse(url2).getroot()
    other_doc.make_links_absolute()
    left = base_doc.cssselect(selector)[0]
    right = other_doc.cssselect(selector)[0]
    # Build the diff fragment and splice it in where `left` was.
    merged = fromstring(htmldiff(tostring(left), tostring(right)))
    parent = left.getparent()
    parent.insert(parent.index(left), merged)
    parent.remove(left)
    return base_doc
def checkExistState(dom1, dom2):
    """Heuristically decide whether two DOMs represent the same page state.

    Python 2 code. Identical hashes are an immediate match; a >10% tag-count
    difference is an immediate mismatch; otherwise the larger direction of
    the html diff is inspected, and states match when the inserted content
    amounts to 5% or less of dom1's tag count.
    """
    if hash(dom1) == hash(dom2):
        return True
    else:
        tagCount1, strippedDom1 = traverseDom(dom1)
        tagCount2, strippedDom2 = traverseDom(dom2)
        mintagCount = min(tagCount1, tagCount2)
        maxtagCount = max(tagCount1, tagCount2)
        if float(mintagCount)/float(maxtagCount) < 0.9:
            logger.info("Different States Huge Difference in Tag Count")
            return False
        # Diff in both directions and keep whichever output is larger.
        diff1 = htmldiff(strippedDom1, strippedDom2)
        diff2 = htmldiff(strippedDom2, strippedDom1)
        if len(diff1) > len(diff2):
            diff = diff1
        else:
            diff = diff2
        bdiff = BeautifulSoup(diff)
        ins = ''.join(str(bdiff.findAll("ins")))
        delete = ''.join(str(bdiff.findAll("del")))
        print cleanDom(delete)
        diffDom = cleanDom(ins)
        print diffDom
        if diffDom != "[]":
            diffTagCount, diffStrippedDom = traverseDom(diffDom)
        else:
            # Nothing inserted: fall back to comparing the stripped DOMs.
            if hash(strippedDom1) == hash(strippedDom2):
                return True
            else:
                return False
        logger.info("tag count %d %d" % (diffTagCount, tagCount1))
        if (float(diffTagCount)/float(tagCount1))*100 > 5:
            return False
        logger.info("STATE ALREADY EXIST")
        #print dom1
        #print dom2
        return True
def highlight_edits(new_html: str, old_html: str) -> str:
    """Return *new_html* with insertions relative to *old_html* underlined.

    Deletions are removed entirely; <ins> is rewritten to <u> because Riot
    does not allow <ins>. A leading "<u>Edit:</u> " marker on the old html
    is excluded from the diff.
    """
    # Don't include `Edit:` text in diff.
    if old_html.startswith("<u>Edit:</u> "):
        old_html = old_html[len("<u>Edit:</u> "):]
    # Generate diff with lxml
    new_html = htmldiff(old_html, new_html)
    # Replace <ins> with <u> since Riot doesn't allow <ins>
    new_html = new_html.replace("<ins>", "<u>").replace("</ins>", "</u>")
    # Remove <del>s since we just want to hide deletions.
    # BUG FIX: re.DOTALL so deletions spanning newlines are also removed
    # (htmldiff output may contain newlines inside <del>...</del>).
    new_html = re.sub(r"<del>.+?</del>", "", new_html, flags=re.DOTALL)
    return new_html
def text_section_update_definitions_if_new(self, message: Dict):
    """Trigger definition updates when a text section's body appears changed.

    NOTE(review): htmldiff() is handed parsed lxml elements rather than
    strings, and its return value is truthy even for identical inputs
    (it returns the merged html, not only the changes) — so this check
    may fire on every call; verify against lxml.html.diff semantics.
    """
    text_section = TextSection.objects.get(pk=message['text_section_pk'])
    old_html = fragment_fromstring(message['old_body'], create_parent='div')
    new_html = fragment_fromstring(text_section.body, create_parent='div')
    if htmldiff(old_html, new_html):
        logger.info(
            f'Found new body in text section pk={message["text_section_pk"]}'
        )
        text_section.update_definitions()
def note_diff(note1, note2):
    """Render two notes with the same template and return an inline html diff.

    Falls back to the rendered second note when htmldiff raises KeyError.
    """
    context = {
        'note': note1,
        'last_note': note2,
        'note_src': note2.uid,
        'http_root': settings.HTTP_ROOT
    }
    r1 = render_to_string('notesgroup/note_view.html', context)
    context['note'] = note2
    r2 = render_to_string('notesgroup/note_view.html', context)
    try:
        content = htmldiff(r1, r2)
    except KeyError:
        content = r2
    # Inline the .group style so the diff renders standalone (no stylesheet).
    content = content.replace('class="group"',
        'style="border:1px solid #CCCCCC; margin:1em 0 0; padding:0 1em;"')
    return content
def diff_with_previous(self, subject, page):
    """Show the diff between a page version (?version_id=N) and its predecessor."""
    c.breadcrumbs = [{'link': subject.url(), 'title': subject.title},
                     {'link': page.url(), 'title': page.title}]
    if page not in subject.pages:
        abort(404)
    version_id = int(request.GET['version_id'])
    c.version = PageVersion.get(version_id)
    # The predecessor is taken at idx+1 — assumes page.versions is ordered
    # newest-first; TODO confirm. Raises IndexError for the oldest version.
    idx = page.versions.index(c.version)
    c.prev_version = page.versions[idx+1]
    c.diff = literal(htmldiff(html_cleanup(c.prev_version.content),
                              html_cleanup(c.version.content)))
    return render('page/diff_with_previous.mako')
def highlight_html_differences(s1: str, s2: str, msg_id: Optional[int]=None) -> str:
    """Diff two html strings and restyle the <del>/<ins> markers as css spans.

    Returns the serialized tree (bytes, per lxml.html.tostring's default).
    `msg_id` is accepted for interface compatibility but unused here.
    """
    diffed = htmldiff(s1, s2)
    tree = lxml.html.fromstring(diffed)
    restyling = {'del': 'highlight_text_deleted', 'ins': 'highlight_text_inserted'}
    for tag, css_class in restyling.items():
        for node in tree.cssselect(tag):
            node.tag = 'span'
            node.set('class', css_class)
    return lxml.html.tostring(tree)
def difference2(current, old):
    """Return an html view of the step changes between two work-item revisions.

    Revision "1" has no predecessor, so its html is returned as-is. When the
    parsed xml of both revisions matches, a "no changes" notice is prepended
    to the current revision's parsed text. Otherwise the html diff is
    returned with <del>/<ins> wrapped in red/green font tags.
    """
    if current.revision == "1":
        return current.html
    if parse_xml(old.json_work_item_revision_text) == parse_xml(
            current.json_work_item_revision_text):
        return "<form id=err>No steps changes in the revision %s</form>" % current.revision + parse_xml(
            current.json_work_item_revision_text)
    colored = htmldiff(old.html, current.html)
    colored = colored.replace("<del>", "<del><font color=red>")
    colored = colored.replace("</del>", "</del></font>")
    colored = colored.replace("<ins>", "<ins><font color=green>")
    colored = colored.replace("</ins>", "</ins></font>")
    return colored
def highlight_html_differences(s1, s2, msg_id=None):
    # type: (str, str, Optional[int]) -> str
    """Diff two html strings, rewriting <del>/<ins> markers as styled spans."""
    markup = htmldiff(s1, s2)
    root = lxml.html.fromstring(markup)  # type: ignore # https://github.com/python/typeshed/issues/525
    for selector, klass in (('del', 'highlight_text_deleted'),
                            ('ins', 'highlight_text_inserted')):
        for node in root.cssselect(selector):
            node.tag = 'span'
            node.set('class', klass)
    return lxml.html.tostring(root)  # type: ignore # https://github.com/python/typeshed/issues/525
def highlight_html_differences(s1: str, s2: str, msg_id: Optional[int] = None) -> str:
    """Diff two html strings and convert the diff markers to css-classed spans.

    `msg_id` is accepted for interface compatibility but not used.
    """
    tree = lxml.html.fromstring(htmldiff(s1, s2))

    def _restyle(tag: str, css_class: str) -> None:
        # Turn every <tag> into <span class="css_class">.
        for node in tree.cssselect(tag):
            node.tag = 'span'
            node.set('class', css_class)

    _restyle('del', 'highlight_text_deleted')
    _restyle('ins', 'highlight_text_inserted')
    return lxml.html.tostring(tree)
def process_response(self,before_data,after_data, process_type = "injection", payloadpattern=""): diff_data = bs(str(htmldiff(after_data,before_data).split("<del>")[1:-1])) #pdb.set_trace() if process_type == "injection": if re.search(r'[sS][qQ][lL]',diff_data.text): try: print after_data.geturl() except: pass #pdb.set_trace() if payloadpattern != "": if re.search(payloadpattern,after_data): print "YESS YOU GOT IT" pdb.set_trace()
def highlight_html_differences(s1: str, s2: str, msg_id: Optional[int] = None) -> str:
    """Diff two html strings and restyle diff markers as css-classed spans.

    Returns a str (serialized with encoding="unicode"). `msg_id` is part
    of the interface but unused here.
    """
    document = lxml.html.fromstring(htmldiff(s1, s2))
    class_by_tag = {"del": "highlight_text_deleted", "ins": "highlight_text_inserted"}
    for tag, css_class in class_by_tag.items():
        for element in document.cssselect(tag):
            element.tag = "span"
            element.set("class", css_class)
    return lxml.html.tostring(document, encoding="unicode")
def test_new_render_view(self):
    """
    Test final html rendered by comparing with a reference file.
    Write the result into ./rendered/export.html

    NOTE(review): the unconditional `return` below deliberately disables
    the comparison (the author preferred manual verification), so the
    assertions after it are dead code.
    """
    ref_render_html = (Path(__file__).resolve().parent / 'new_ref_render.html').read_text()
    # this was a try to test against an another file. I won't say it was not working, but yeah, manual
    # verification was really more helpful
    assert True
    return
    self.longMessage = False
    self.assertEqual(
        ref_render_html,
        self.rendered_html,
        htmldiff(ref_render_html, self.rendered_html)
    )
    self.longMessage = True
def extract_features(source1, source2):
    """Extract normalized inserted-tag features from the diff of two gzipped pages.

    Each <ins> fragment containing markup is trimmed to its tag content,
    whitespace removed, and attribute values / inner text blanked so only
    the tag skeleton remains. Returns the fragments joined by spaces.
    NOTE(review): the trailing trim uses the FIRST '>' in the fragment
    (`ins[:ins.index('>')]`) and drops the '>' itself — confirm whether
    the last '>' / an inclusive slice was intended.
    """
    source1 = gzip.open(source1).read()
    source2 = gzip.open(source2).read()
    soup = BeautifulSoup(htmldiff(source1, source2), 'html.parser')
    inserts = []
    for ins in soup.find_all('ins'):
        ins = str(ins)[5:-6].strip()  # drop the surrounding "<ins>"/"</ins>"
        if '<' in ins and '>' in ins:
            if not ins.startswith('<'):
                ins = ins[ins.index('<'):]
            if not ins.endswith('>'):
                ins = ins[:ins.index('>')]
            ins = re.sub(r'\s+', '', ins)
            ins = re.sub(r'".*"', '""', ins)
            ins = re.sub(r'>.*<', '><', ins)
            inserts.append(ins)
    return ' '.join(inserts)
def answer_revisions(request, answer_id):
    """Revision history for an Answer."""
    answer = get_object_or_404(Answer, id=answer_id)
    revisions = list(answer.revisions.all())
    # Bulk-populate author fields to avoid a user query per revision.
    populate_foreign_key_caches(User, ((revisions, ('author',)),),
        fields=('username', 'gravatar', 'reputation', 'gold', 'silver', 'bronze'))
    for i, revision in enumerate(revisions):
        revision.html = QUESTION_REVISION_TEMPLATE % {
            'html': sanitize_html(markdowner.convert(revision.text)),
        }
        # Attach to each revision the diff against the revision that follows it.
        if i > 0:
            revisions[i - 1].diff = htmldiff(revision.html, revisions[i - 1].html)
    return render_to_response('answer_revisions.html', {
        'title': u'Answer Revisions',
        'answer': answer,
        'revisions': revisions,
    }, context_instance=RequestContext(request))
def highlight_html_differences(s1, s2, msg_id=None):
    # type: (str, str, Optional[int]) -> str
    """Diff two html strings and convert <del>/<ins> markers into styled spans."""
    html_diff = htmldiff(s1, s2)
    parsed = lxml.html.fromstring(
        html_diff)  # type: ignore # https://github.com/python/typeshed/issues/525
    for old_tag, new_class in [('del', 'highlight_text_deleted'),
                               ('ins', 'highlight_text_inserted')]:
        for el in parsed.cssselect(old_tag):
            el.tag = 'span'
            el.set('class', new_class)
    return lxml.html.tostring(
        parsed
    )  # type: ignore # https://github.com/python/typeshed/issues/525
def get_diff(self):
    """Get diff between the latest to website.check_files

    Diffs the css-selected fragment (or the whole <html>) of the last two
    snapshots. Returns (BeautifulSoup of the diff, raw diff string).
    NOTE(review): `soup.html()` CALLS the <html> tag (Tag.__call__ is
    find_all in BeautifulSoup), yielding a result list rather than the tag
    itself — `soup.html` was likely intended; confirm before changing.
    """
    hashes = []
    for _check_file in [self.check_files[-2], self.check_files[-1]]:
        soup = _check_file.soup
        if self.css_selector:
            cont = soup.select(self.css_selector)
            if len(cont) > 1:
                sys.exit('!! selector not unique')
            if not cont:
                sys.exit(f"!! selector '{self.css_selector}' no results")
            cont = cont[0]
        else:
            cont = soup.html()
        hashes.append(str(cont))
    diff = htmldiff(hashes[0], hashes[1])
    return BeautifulSoup(diff, 'lxml'), diff
def check_page(url):
    """Diff a page against its stored previous version and persist the new one.

    Returns the html diff string when the page changed, else None (also when
    retrieval failed or no previous version existed).
    """
    diff = None
    old_version = load_previous(url)
    new_version = load_current(url)
    if new_version is None:
        # BUG FIX: the original early return made the later
        # `elif new_version is None` branch unreachable, so this error
        # was never logged.
        logging.error('There was an error retrieving new version of url, see requests log: {}'.format(url))
        return None
    if old_version != new_version and not any([old_version is None, new_version is None]):
        diff = htmldiff(old_version, new_version)
        if new_version is not None:
            save_version(url, new_version)
    elif old_version is None and new_version is not None:
        save_version(url, new_version)
        logging.info('No previous version of page found for url: {}'.format(url))
    else:
        logging.info("No change in page found: {}".format(url))
    return diff
def answer_revisions(request, answer_id):
    """Revision history for an Answer."""
    answer = get_object_or_404(Answer, id=answer_id)
    revisions = list(answer.revisions.all())
    # Bulk-populate author fields to avoid a user query per revision.
    populate_foreign_key_caches(User, ((revisions, ('author', )), ),
        fields=('username', 'gravatar', 'reputation', 'gold', 'silver', 'bronze'))
    for i, revision in enumerate(revisions):
        revision.html = QUESTION_REVISION_TEMPLATE % {
            'html': sanitize_html(markdowner.convert(revision.text)),
        }
        # Attach to each revision the diff against the revision that follows it.
        if i > 0:
            revisions[i - 1].diff = htmldiff(revision.html, revisions[i - 1].html)
    return render_to_response('answer_revisions.html', {
        'title': u'Answer Revisions',
        'answer': answer,
        'revisions': revisions,
    }, context_instance=RequestContext(request))
def diffhtml(before_html, after_html, title=None):
    """Diffs the two files, and returns an html fragment that wraps
    differences in <ins> or <del> tags, which you can style as desired.

    Returns bytes, not str, because everything else works in bytes due
    to using requests.
    """
    if not title:
        title = "Changed Agenda"
    # lxml.html.htmldiff only accepts strings, not bytes, but these
    # were read in as bytes because that's what comes from requests;
    # so translate them.
    if type(before_html) is bytes:
        before_html = before_html.decode()
    if type(after_html) is bytes:
        after_html = after_html.decode()
    # lxml.html.htmldiff returns fragments, not full documents.
    # So add a header that includes a style for ins and del.
    diff = '''<!DOCTYPE HTML PUBLIC "-//W3C//DTD HTML 4.01 Transitional//EN">
<html>
<head>
<meta http-equiv="content-type" content="text/html; charset=utf-8" />
<meta name="viewport" content="width=device-width, initial-scale=1">
<title>%s</title>
<style>
ins { background: #9ff; }
del { background: #fbb; }
</style>
</head>
<body>
<h1>%s</h1>
''' % (title, title)
    diff += htmldiff(before_html, after_html)
    diff += "\n</body></html>\n"
    # encode to return bytes.
    return diff.encode()
def question_revisions(request, question_id):
    """Revision history for a Question."""
    question = get_object_or_404(Question, id=question_id)
    revisions = list(question.revisions.all())
    # Bulk-populate author fields to avoid a user query per revision.
    populate_foreign_key_caches(User, ((revisions, ('author',)),),
        fields=('username', 'gravatar', 'reputation', 'gold', 'silver', 'bronze'))
    for i, revision in enumerate(revisions):
        revision.html = QUESTION_REVISION_TEMPLATE % {
            'title': revision.title,
            'html': sanitize_html(markdowner.convert(revision.text)),
            'tags': ' '.join(['<a class="tag">%s</a>' % tag
                              for tag in revision.tagnames.split(' ')]),
        }
        # Attach to each revision the diff against the revision that follows it.
        if i > 0:
            revisions[i - 1].diff = htmldiff(revision.html, revisions[i - 1].html)
    return render_to_response('question_revisions.html', {
        'title': u'Question Revisions',
        'question': question,
        'revisions': revisions,
    }, context_instance=RequestContext(request))
def diff_texts(text1, text2):
    """Return an html diff of two texts.

    Optionally strips html tags (REVERSION2_DIFF_TEXT_ONLY) and collapses
    duplicate newlines (REVERSION2_IGNORE_WHITESPACE) before diffing, which
    helps ignore diff-plugin ids and whitespace noise.
    """
    from lxml.html.diff import htmldiff

    if REVERSION2_DIFF_TEXT_ONLY:
        text1 = BeautifulSoup(text1, features="lxml").get_text()
        text2 = BeautifulSoup(text2, features="lxml").get_text()
    if REVERSION2_IGNORE_WHITESPACE:
        text1 = re.sub(r'\n+', '\n', text1).strip()
        text2 = re.sub(r'\n+', '\n', text2).strip()
    return htmldiff(text1, text2)
def _generate_diffs(self):
    """Build per-page html diff files between the head and base site builds.

    For every tracked file, renders a "diff-<name>" page into the head build
    directory, combining head's <head> (plus an injected stylesheet) with the
    htmldiff of the two <body> elements. Files present on only one side
    (additions/deletions) are skipped.
    """
    self._updater.update(
        status='pending',
        description="Creating head and base website diffs.")
    for f in self._files:
        # Non-html outputs map to their generated .html counterpart.
        froot, fext = os.path.splitext(f)
        if fext not in HTML_EXTS:
            f = froot + '.html'
        f = os.path.join("_site", f)
        fpath, fname = os.path.split(f)
        head = os.path.join(self._head_dir, f)
        base = os.path.join(self._base_dir, f)
        diff = os.path.join(self._head_dir, fpath, "diff-" + fname)
        # if addition or deletion, just skip
        if not os.path.isfile(head) or not os.path.isfile(base):
            continue
        with open(base, 'r') as f:
            doc1 = lxml.html.parse(f)
        with open(head, 'r') as f:
            doc2 = lxml.html.parse(f)
        doc1body = doc1.find('body')
        doc2body = doc2.find('body')
        bodydiff = htmldiff(
            lxml.html.tostring(doc1body, encoding='utf-8').decode('utf-8'),
            lxml.html.tostring(doc2body, encoding='utf-8').decode('utf-8'))
        doc2head = doc2.find('head')
        add_stylesheet(doc2head)
        diffdoc = u'<html>\n{0}\n<body>\n{1}\n</body>\n</html>'
        diffdoc = diffdoc.format(
            lxml.html.tostring(doc2head, encoding='utf-8').decode('utf-8'),
            bodydiff)
        with io.open(diff, 'wb') as f:
            f.write(diffdoc.encode('utf-8'))
        print("diff'd {0!r}".format(diff))
def diff_rss(url, name, limit=-1):
    """Collect off-site links added between consecutive posts of an RSS feed.

    Python 2 code. For each entry, diffs the previous post's body against the
    current one and records hrefs found inside <ins> elements that point to a
    different domain. Returns {entry_link: [added hrefs]}.
    NOTE(review): parameter `name` is unused.
    """
    rss = feedparser.parse(url)
    links = {}
    #print rss
    if limit == -1 or limit > len(rss.entries):
        limit = len(rss.entries)
    first_index = get_first_index(rss)
    for i in range(first_index, limit+1):
        links[rss.entries[i].link] = []
        post1 = rss.entries[i-1].link
        # At the last index, compare against the entry before first_index
        # — TODO confirm this wrap-around is the intended pairing.
        if i == limit:
            post2 = rss.entries[first_index-1].link
        else:
            post2 = rss.entries[i].link
        print post2
        diffh = htmldiff(get_content(post1)["body"], get_content(post2)["body"])
        tree = etree.parse(StringIO.StringIO(diffh), parser)
        diff = tree.xpath("//ins//@href")
        for d in diff:
            # Keep only links to other domains with a non-root path.
            if urlparse(d).netloc != urlparse(rss.feed.link).netloc and urlparse(d).path != '/':
                links[rss.entries[i].link].append(d)
    return links
def difference2(test_case_id, test_case_rev):
    """Return an html view of the step changes between a test case revision
    and its predecessor.

    Revision "1" has no predecessor, so its parsed xml is returned as-is.
    When both revisions parse identically, a "no changes" notice is
    prepended; otherwise the html diff is returned with <del>/<ins>
    wrapped in red/green font tags.
    """
    if test_case_rev == "1":
        return parse_xml(test_case_id, test_case_rev)
    # Hoist the two parse_xml calls: the original parsed the previous
    # revision twice and the current one up to twice. Assumes parse_xml
    # is side-effect free — it is only used for its return value here.
    prev_rev = str(int(test_case_rev) - 1)
    old = parse_xml(test_case_id, prev_rev)
    new = parse_xml(test_case_id, test_case_rev)
    if old == new:
        return "<form id=err>No steps changes in the revision %s</form>" % test_case_rev + new
    diff_html = htmldiff(old, new)
    diff_html = diff_html.replace("<del>", "<del><font color=red>")
    diff_html = diff_html.replace("</del>", "</del></font>")
    diff_html = diff_html.replace("<ins>", "<ins><font color=green>")
    diff_html = diff_html.replace("</ins>", "</ins></font>")
    return diff_html
# print(parse_xml("409770","11"))
# print(get_t_c_rev("409770","32"))
# print(parse_html("446114 ", "18"))
def smart_read(url): resp = urllib2.urlopen(url) #resolve url url = resp.url domain = urlparse(url).netloc path = urlparse(url).path html = resp.read() tree = etree.parse(StringIO.StringIO(html), parser) links = tree.xpath("//body//@href") nmax = 0 for link in links: if urlparse(link).netloc == domain: ng = NGram.compare(urlparse(link).path,path) #print link,ng if ng > nmax and ng < 1: nmax = ng mirror = link diffh = htmldiff(visit_page(url)["body"], visit_page(mirror)["body"]) tree = etree.parse(StringIO.StringIO(diffh), parser) diff = tree.xpath("//ins//text()") for d in diff: print d
def getHtmlDiff(dom1, dom2, tagCount1, tagCount2): diff1 = htmldiff(dom1, dom2) #diff2 = htmldiff(dom2 ,dom1) #print diff1 ''' if len(diff1) > len(diff2): diff = diff1 tagCount = tagCount1 else: diff = diff2 tagCount = tagCount2 ''' diff = diff1 tagCount = tagCount1 #diff = diff1 bdiff = BeautifulSoup(diff) ins = ''.join(str(bdiff.findAll("ins"))) print ins delete = ''.join(str(bdiff.findAll("del"))) print cleanDom(delete) diffDom = cleanDom(ins) print diffDom return diffDom
#!/usr/bin/env python from sys import argv from lxml.html.diff import htmldiff def help(): print "----------------------------------" print " An diff tool!" print "----------------------------------" print print "Usage: <file_a> <file_b>" print "Output: HTML" print print "Example output:" print " <ins>hello</ins><del>goodbye cruel</del> world" print if __name__ == "__main__": if len(argv) != 3: help() exit(1) else: file_a, file_b = argv[1:] a = open(file_a).read() b = open(file_b).read() print htmldiff(a, b)
def htmldiffer(ver_1, ver_2):
    """Return html diffs (ver_2 -> ver_1) for a version's content, license and title."""
    return {
        'content': htmldiff(ver_2.content, ver_1.content),
        'license': htmldiff(ver_2.license, ver_1.license),
        'title': htmldiff(ver_2.title, ver_1.title),
    }
def _build_email_body(self, mako_template_filepath: str, role: UserRoleInWorkspace, content: Content, actor: User) -> str:
    """
    Build an email body and return it as a string

    :param mako_template_filepath: the absolute path to the mako template
        to be used for email body building
    :param role: the role related to user to whom the email must be sent.
        The role is required (and not the user only) in order to show in
        the mail why the user receives the notification
    :param content: the content item related to the notification
    :param actor: the user at the origin of the action / notification
        (for example the one who wrote a comment)
    :return: the built email body as string. In case of multipart email,
        this method must be called one time for text and one time for html
    """
    logger.debug(self, 'Building email content from MAKO template {}'.format(mako_template_filepath))
    template = Template(filename=mako_template_filepath)
    # TODO - D.A. - 2014-11-06 - move this
    # Import is here for circular import problem
    import tracim.lib.helpers as helpers
    dictified_item = Context(CTX.EMAIL_NOTIFICATION, self._global_config.WEBSITE_BASE_URL).toDict(content)
    dictified_actor = Context(CTX.DEFAULT).toDict(actor)
    main_title = dictified_item.label
    content_intro = ''
    content_text = ''
    call_to_action_text = ''
    # Choose intro/body/call-to-action depending on the last action and
    # the content type. Revisions of pages/threads get an html diff of
    # label and description against the previous revision.
    action = content.get_last_action().id
    if ActionDescription.COMMENT == action:
        content_intro = _('<span id="content-intro-username">{}</span> added a comment:').format(actor.display_name)
        content_text = content.description
        call_to_action_text = _('Answer')
    elif ActionDescription.CREATION == action:
        # Default values (if not overriden)
        content_text = content.description
        call_to_action_text = _('View online')
        if ContentType.Thread == content.type:
            call_to_action_text = _('Answer')
            content_intro = _('<span id="content-intro-username">{}</span> started a thread entitled:').format(actor.display_name)
            content_text = '<p id="content-body-intro">{}</p>'.format(content.label) + \
                content.get_last_comment_from(actor).description
        elif ContentType.File == content.type:
            content_intro = _('<span id="content-intro-username">{}</span> added a file entitled:').format(actor.display_name)
            if content.description:
                content_text = content.description
            else:
                content_text = '<span id="content-body-only-title">{}</span>'.format(content.label)
        elif ContentType.Page == content.type:
            content_intro = _('<span id="content-intro-username">{}</span> added a page entitled:').format(actor.display_name)
            content_text = '<span id="content-body-only-title">{}</span>'.format(content.label)
    elif ActionDescription.REVISION == action:
        content_text = content.description
        call_to_action_text = _('View online')
        if ContentType.File == content.type:
            content_intro = _('<span id="content-intro-username">{}</span> uploaded a new revision.').format(actor.display_name)
            content_text = ''
        elif ContentType.Page == content.type:
            content_intro = _('<span id="content-intro-username">{}</span> updated this page.').format(actor.display_name)
            previous_revision = content.get_previous_revision()
            title_diff = ''
            if previous_revision.label != content.label:
                title_diff = htmldiff(previous_revision.label, content.label)
            content_text = _('<p id="content-body-intro">Here is an overview of the changes:</p>') + \
                title_diff + \
                htmldiff(previous_revision.description, content.description)
        elif ContentType.Thread == content.type:
            content_intro = _('<span id="content-intro-username">{}</span> updated the thread description.').format(actor.display_name)
            previous_revision = content.get_previous_revision()
            title_diff = ''
            if previous_revision.label != content.label:
                title_diff = htmldiff(previous_revision.label, content.label)
            content_text = _('<p id="content-body-intro">Here is an overview of the changes:</p>') + \
                title_diff + \
                htmldiff(previous_revision.description, content.description)
        # elif ContentType.Thread == content.type:
        #     content_intro = _('<span id="content-intro-username">{}</span> updated this page.').format(actor.display_name)
        #     previous_revision = content.get_previous_revision()
        #     content_text = _('<p id="content-body-intro">Here is an overview of the changes:</p>') + \
        #         htmldiff(previous_revision.description, content.description)
    elif ActionDescription.EDITION == action:
        call_to_action_text = _('View online')
        if ContentType.File == content.type:
            content_intro = _('<span id="content-intro-username">{}</span> updated the file description.').format(actor.display_name)
            content_text = '<p id="content-body-intro">{}</p>'.format(content.get_label()) + \
                content.description
    if '' == content_intro and content_text == '':
        # Skip notification, but it's not normal
        logger.error(
            self,
            'A notification is being sent but no content. '
            'Here are some debug informations: [content_id: {cid}]'
            '[action: {act}][author: {actor}]'.format(
                cid=content.content_id, act=action, actor=actor
            )
        )
        raise ValueError('Unexpected empty notification')
    # Import done here because cyclic import
    from tracim.config.app_cfg import CFG
    body_content = template.render(
        base_url=self._global_config.WEBSITE_BASE_URL,
        _=_,
        h=helpers,
        user_display_name=role.user.display_name,
        user_role_label=role.role_as_label(),
        workspace_label=role.workspace.label,
        content_intro=content_intro,
        content_text=content_text,
        main_title=main_title,
        call_to_action_text=call_to_action_text,
        result = DictLikeClass(item=dictified_item, actor=dictified_actor),
        CFG=CFG.get_instance(),
    )
    return body_content
def do_stuff(i=0):
    """Diff successive gzipped snapshots of every crawled page in one chunk of content/.

    The glob of domain directories is split into 6 equal chunks so six worker
    processes can run in parallel; *i* (0-5) selects this worker's chunk.  For
    every unique URL path, the snapshots are sorted by their timestamp suffix
    and each consecutive pair is diffed with htmldiff; results are written
    under diffs/ mirroring the source layout.  Already-existing diff files are
    skipped so an interrupted run can be resumed.
    """
    domains = glob.glob('content/*')
    chunk_size = len(domains) / 6
    # Keep only the 1/6th of the domain list assigned to process *i*.
    domains = domains[int(i * chunk_size):int((i + 1) * chunk_size)]
    logging.info("Starting chunk " + str(int(i * chunk_size)))
    for domain in domains:
        # Strip the trailing timestamp component to collect the unique URL paths.
        uniq_urls = set()
        for filename in glob.glob(domain + '/*'):
            uniq_urls.add('_'.join(filename.split('_')[:-1]))
        for path in uniq_urls:
            # Collect the snapshot timestamps for this path; filenames whose
            # suffix is not a parsable float are silently ignored.
            timestamps = set()
            for filename in glob.glob(path + '*'):
                try:
                    timestamps.add(float('.'.join(filename.split('_')[-1].split('.')[:-1])))
                except Exception:
                    pass
            if not timestamps:
                # Fix: previously timestamps[0] raised an uncaught IndexError
                # when no filename parsed; nothing to diff, move on.
                continue
            try:
                # Diffs must be generated oldest-to-newest.
                ordered = sorted(timestamps)
                previous_file = path + '_' + str(ordered[0]) + '.gz'
                # Iterate the remaining copies in chronological order.
                # (Renamed from 'i', which shadowed the chunk parameter.)
                for ts in ordered[1:]:
                    current_file = path + '_' + str(ts) + '.gz'
                    out_name = "diffs/" + path + '_' + str(ts) + '_diff.gz'
                    # Make sure the output folder exists.
                    # (Renamed from 'dir', which shadowed the builtin.)
                    out_dir = os.path.dirname(out_name)
                    if not os.path.exists(out_dir):
                        os.makedirs(out_dir)
                    # Skip pairs that have already been diffed (resumability).
                    if not os.path.isfile(out_name):
                        with open(out_name, 'w') as f:
                            try:
                                try:
                                    # All of the diffing and writing happens here,
                                    # on the raw bytes of the two snapshots.
                                    f.write(htmldiff(gzip.open(previous_file).read(),
                                                     gzip.open(current_file).read()))
                                    logging.debug("successful write " + out_name)
                                except Exception:
                                    # Fallback: decode as UTF-8 (ignoring bad
                                    # bytes) before diffing, then re-encode.
                                    f.write(unicode(htmldiff(
                                        gzip.open(previous_file).read().decode('utf-8', 'ignore'),
                                        gzip.open(current_file).read().decode('utf-8', 'ignore'),
                                    )).encode('utf-8', 'ignore'))
                                    logging.debug("transcode write " + out_name)
                            except IOError:
                                logging.debug("missing file " + out_name)
                            except AssertionError as e:
                                # Fix: concatenating the exception object itself
                                # raised TypeError and lost the log line; use str(e).
                                logging.debug("Assertion Error " + out_name + " : " + str(e))
                    else:
                        logging.debug("already exists " + out_name)
                    previous_file = current_file
            except TypeError as e:
                print(e)
def show_diff(old_diff, new_diff):
    """Render the markup difference between two HTML fragments.

    Returns the new fragment annotated with <ins>/<del> tags marking
    what changed relative to the old one.
    """
    diff_markup = htmldiff(old_diff, new_diff)
    return diff_markup
def value(self):
    """Return the HTML diff of the two stored values (value1 vs value2)."""
    before = self.value1
    after = self.value2
    return htmldiff(before, after)
def getDomDiff(parentDom, childDom):
    """Diff two DOM strings and return the cleaned DOM of the inserted content.

    Runs htmldiff on the parent and child markup, extracts every <ins>
    element from the annotated result, and passes their combined string
    representation through cleanDom.
    """
    annotated = htmldiff(parentDom, childDom)
    soup = BeautifulSoup(annotated)
    # str() of the findAll result list; the original wrapped this in a
    # ''.join(...) which is an identity operation on a string.
    inserted = str(soup.findAll("ins"))
    return cleanDom(inserted)
new_contents = site.get_page(subpath, rev=new) except sven.ResourceUnchanged, e: new = e.last_change resource_unchanged = True except sven.NotAFile: return redirect(site.directory_index_url(subpath)) except sven.NoSuchResource: return redirect(site.history_url(subpath)) except sven.FutureRevision: return redirect(site.history_url(subpath)) if resource_unchanged: return redirect(site.page_diff_url(subpath) + "?versions=%s,%s" % (old, new)) # @@todo: raw diff? binary diff? contents = htmldiff(old_contents, new_contents) mimetype = mimetypes.guess_type(subpath)[0] return dict(site=site, contents=contents, mimetype=mimetype, path=subpath) @requires("WIKI_EDIT") @allow_http("GET", "POST") @rendered_with("sites/site/page-create.html") def page_create(request, subpath): site = request.site if request.method == "POST": path = request.POST['path'] # @@todo: don't slugify for raw wikis? dunno from django.template.defaultfilters import slugify path = '/'.join(slugify(i) for i in path.split('/'))
def _diff_html(left, right):
    """Thin wrapper: compute the HTML diff of *right* against *left*."""
    result = htmldiff(left, right)
    return result