# Excerpt of a class method; assumes hashlib, logging, os, time, PIL's Image,
# selenium's webdriver, and simplediff's html_diff are imported at module level.
def show_diff(self, old, new):
    if len(old) == 0 or len(new) == 0:
        logging.info('Old or New empty')
        return False

    new_hash = hashlib.sha224(new.encode('utf8')).hexdigest()
    logging.info(html_diff(old, new))
    html = """
    <!doctype html>
    <html lang="en">
    <head>
        <meta charset="utf-8">
        <link rel="stylesheet" href="./css/styles.css">
    </head>
    <body>
        <p>
            {}
        </p>
    </body>
    </html>
    """.format(html_diff(old, new))
    with open('tmp.html', 'w') as f:
        f.write(html)

    CHROMEDRIVER_PATH = os.environ.get('CHROMEDRIVER_PATH',
                                       '/usr/local/bin/chromedriver')
    driver = webdriver.Chrome(CHROMEDRIVER_PATH)
    driver.get('file://%s/tmp.html' % os.getcwd())
    e = driver.find_element_by_xpath('//p')

    # Pad the <p> block by its own page offset on every side.
    start_height = e.location['y']
    block_height = e.size['height']
    end_height = start_height
    start_width = e.location['x']
    block_width = e.size['width']
    end_width = start_width
    total_height = start_height + block_height + end_height
    total_width = start_width + block_width + end_width

    timestamp = str(int(time.time()))
    driver.save_screenshot('./tmp.png')
    img = Image.open('./tmp.png')
    img2 = img.crop((0, 0, total_width, total_height))

    # Crops wider than 2:1 get a taller transparent canvas; either way the
    # crop is centered on the canvas.
    if int(total_width) > int(total_height * 2):
        background = Image.new('RGBA', (total_width, int(total_width / 2)),
                               (255, 255, 255, 0))
    else:
        background = Image.new('RGBA', (total_width, total_height),
                               (255, 255, 255, 0))
    bg_w, bg_h = background.size
    offset = (int((bg_w - total_width) / 2), int((bg_h - total_height) / 2))
    background.paste(img2, offset)

    self.filename = timestamp + new_hash
    background.save('./output/' + self.filename + '.png')
    return True
import hashlib
import logging
import os
import time

from PIL import Image
from selenium import webdriver
from simplediff import html_diff

# PHANTOMJS_PATH is assumed to be configured elsewhere in the module.


def generate_diff(old, new):
    if len(old) == 0 or len(new) == 0:
        logging.info('Old or New empty')
        return False, None  # keep the (ok, path) return shape consistent

    new_hash = hashlib.sha224(new.encode('utf8')).hexdigest()
    logging.info(html_diff(old, new))
    html = """
    <!doctype html>
    <html lang="en">
    <head>
        <meta charset="utf-8">
        <link rel="stylesheet" href="./css/styles.css">
    </head>
    <body>
        <p>
            {}
        </p>
    </body>
    </html>
    """.format(html_diff(old, new))
    with open('tmp.html', 'w') as f:
        f.write(html)

    driver = webdriver.PhantomJS(executable_path=PHANTOMJS_PATH + '/phantomjs')
    # WebDriver needs an absolute file:// URL, not a relative path.
    driver.get('file://%s/tmp.html' % os.getcwd())
    e = driver.find_element_by_xpath('//p')

    start_height = e.location['y']
    block_height = e.size['height']
    end_height = start_height
    start_width = e.location['x']
    block_width = e.size['width']
    end_width = start_width
    total_height = start_height + block_height + end_height
    total_width = start_width + block_width + end_width

    timestamp = str(int(time.time()))
    driver.save_screenshot('./tmp.png')
    img = Image.open('./tmp.png')
    img2 = img.crop((0, 0, total_width, total_height))

    if int(total_width) > int(total_height * 2):
        background = Image.new('RGBA', (total_width, int(total_width / 2)),
                               (255, 255, 255, 0))
    else:
        background = Image.new('RGBA', (total_width, total_height),
                               (255, 255, 255, 0))
    bg_w, bg_h = background.size
    offset = (int((bg_w - total_width) / 2), int((bg_h - total_height) / 2))
    background.paste(img2, offset)

    filename = timestamp + new_hash
    exported_filename = './output/' + filename + '.png'
    background.save(exported_filename)
    return True, exported_filename
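# A minimal usage sketch (not part of the original code) showing what
# simplediff.html_diff returns: word-level changes wrapped in <ins>/<del>
# tags, which the two functions above render to HTML and screenshot.
from simplediff import html_diff

print(html_diff('The quick brown fox', 'The quick red fox'))
# -> The quick <del>brown</del> <ins>red</ins> fox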
# Flask route handler; request/jsonify and the project helpers (get_document,
# DomainSpecificExtractor, Scorer, save_document, html_diff, seq2seq_model)
# are assumed to be imported at module level.
def correctTranslation():
    data = request.get_json()
    translation = data["translation"]
    beam = data["beam"]
    document_unk_map = data["document_unk_map"]
    attention = data["attention"]
    document_id = data["document_id"]
    sentence_id = data["sentence_id"]

    document = get_document(document_id)
    extractor = DomainSpecificExtractor(
        source_file=document.filepath,
        train_source_file="myseq2seq/data/wmt14/train.tok.clean.bpe.32000.de",
        train_vocab_file="myseq2seq/train_vocab.pkl")
    keyphrases = extractor.extract_keyphrases()

    for key in document_unk_map:
        if key not in document.unk_map:
            document.unk_map[key] = document_unk_map[key]
        else:
            # Merge list values
            document.unk_map[key] = list(
                set(document.unk_map[key]) | set(document_unk_map[key]))

    sentence = document.sentences[int(sentence_id)]
    if translation != sentence.translation:
        sentence.diff = html_diff(sentence.translation[:-6].replace("@@ ", ""),
                                  translation[:-6].replace("@@ ", ""))
    sentence.translation = translation
    sentence.corrected = True
    sentence.flagged = False
    sentence.attention = attention
    sentence.beam = beam

    scorer = Scorer()
    score = scorer.compute_scores(sentence.source, sentence.translation,
                                  attention, keyphrases)
    score["order_id"] = sentence.score["order_id"]
    sentence.score = score

    document.sentences[int(sentence_id)] = sentence
    save_document(document, document_id)

    from myseq2seq.train import train_iters
    pairs = [sentence.source, sentence.translation[:-6]]
    print(pairs)
    # train_iters(seq2seq_model.encoder, seq2seq_model.decoder, seq2seq_model.input_lang,
    #             seq2seq_model.output_lang, pairs, batch_size=1, print_every=1, n_epochs=1)
    return jsonify({})
# Variant of the handler above; SRC_LANG and TGT_LANG are assumed to be
# module-level constants.
def correctTranslation():
    data = request.get_json()
    translation = data["translation"]
    beam = data["beam"]
    document_unk_map = data["document_unk_map"]
    attention = data["attention"]
    document_id = data["document_id"]
    sentence_id = data["sentence_id"]

    document = get_document(document_id)
    extractor = DomainSpecificExtractor(
        source_file=document.filepath,
        src_lang=SRC_LANG,
        tgt_lang=TGT_LANG,
        train_source_file=f".data/wmt14/train.tok.clean.bpe.32000.{SRC_LANG}",
        train_vocab_file=f".data/vocab/train_vocab_{SRC_LANG}.pkl")
    keyphrases = extractor.extract_keyphrases()

    for key in document_unk_map:
        if key not in document.unk_map:
            document.unk_map[key] = document_unk_map[key]
        else:
            # Merge list values
            document.unk_map[key] = list(
                set(document.unk_map[key]) | set(document_unk_map[key]))

    sentence = document.sentences[int(sentence_id)]
    if translation != sentence.translation:
        sentence.diff = html_diff(sentence.translation[:-4].replace("@@ ", ""),
                                  translation[:-4].replace("@@ ", ""))
    sentence.translation = translation
    sentence.corrected = True
    sentence.flagged = False
    sentence.attention = attention
    sentence.beam = beam

    scorer = Scorer()
    score = scorer.compute_scores(sentence.source, sentence.translation,
                                  attention, keyphrases, "")
    score["order_id"] = sentence.score["order_id"]
    sentence.score = score

    document.sentences[int(sentence_id)] = sentence
    save_document(document, document_id)
    return jsonify({})
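# Illustrative sketch (sample strings are made up) of the detokenization used
# in the handlers above: the [:-4]/[:-6] slices strip a trailing end-of-
# sequence marker such as "</s>", and removing "@@ " undoes BPE subword
# segmentation before the texts are diffed.
bpe_output = 'Ber@@ lin ist gro@@ ß</s>'
print(bpe_output[:-4].replace('@@ ', ''))  # -> Berlin ist groß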
# Assumes hashlib, os, time, PIL's Image, html_diff, strip_html, and the
# ImageDiffGenerator helper class are in scope.
def generate_image_diff(old: str, new: str, text_to_tweet: str):
    ImageDiffGenerator.init()
    stripped_old = ImageDiffGenerator.separate_punctuation(strip_html(old))
    stripped_new = ImageDiffGenerator.separate_punctuation(strip_html(new))
    new_hash = hashlib.sha224(stripped_new.encode('utf8')).hexdigest()

    separated_diff = html_diff(stripped_old, stripped_new)
    diff_html = ImageDiffGenerator.restore_punctuation(separated_diff)
    html = ImageDiffGenerator.html_template \
        .replace("text_to_tweet", text_to_tweet) \
        .replace("diff_html", diff_html)
    with open('tmp.html', 'w', encoding="utf-8") as f:
        f.write(html)

    # WebDriver needs an absolute file:// URL, not a relative path.
    ImageDiffGenerator.driver.get('file://%s/tmp.html' % os.getcwd())
    e = ImageDiffGenerator.driver.find_element_by_id('wrapper')
    start_height = e.location['y']
    block_height = e.size['height']
    end_height = start_height
    total_height = start_height + block_height + end_height
    total_width = 510  # Override because body width is set to 500

    timestamp = str(int(time.time()))
    ImageDiffGenerator.driver.save_screenshot('./tmp.png')
    img = Image.open('./tmp.png')
    img2 = img.crop((0, 0, total_width, total_height))

    if int(total_width) > int(total_height * 2):
        background = Image.new('RGBA', (total_width, int(total_width / 2)),
                               (255, 255, 255, 0))
    else:
        background = Image.new('RGBA', (total_width, total_height),
                               (255, 255, 255, 0))
    bg_w, bg_h = background.size
    offset = (int((bg_w - total_width) / 2), int((bg_h - total_height) / 2))
    background.paste(img2, offset)

    filename = timestamp + new_hash
    saved_file_path = f'./output/{filename}.png'
    background.save(saved_file_path)
    return saved_file_path
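# Sketch of the canvas rule shared by the screenshot functions above: a crop
# wider than 2:1 is pasted onto a width x (width / 2) transparent canvas and
# centered, so the saved image never ends up thinner than 2:1. Sizes here are
# illustrative.
from PIL import Image

crop = Image.new('RGBA', (800, 150))
canvas = Image.new('RGBA', (800, 400), (255, 255, 255, 0))
offset = ((canvas.width - crop.width) // 2, (canvas.height - crop.height) // 2)
canvas.paste(crop, offset)  # crop lands at (0, 125), vertically centered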
def retranslateSentenceWithId(i, sentence, scorer, keyphrases, num_changes,
                              beam_size=3, attLayer=-2, force=False):
    print("Retranslate: " + str(i))
    if sentence.corrected and not force:
        return sentence, num_changes

    translation, attn, translations = model.translate(sentence.source,
                                                      beam_size=beam_size,
                                                      attLayer=attLayer)
    beam = translationsToTree(translations)
    score = scorer.compute_scores(sentence.source, " ".join(translation),
                                  attn, keyphrases, "")
    score["order_id"] = i

    translation_text = " ".join(translation)
    if translation_text != sentence.translation:
        num_changes += 1
        sentence.diff = html_diff(sentence.translation[:-4].replace("@@ ", ""),
                                  translation_text[:-4].replace("@@ ", ""))
    sentence.translation = translation_text
    sentence.beam = beam
    sentence.score = score
    sentence.attention = attn
    return sentence, num_changes
def retranslate(document_id):
    document = get_document(document_id)
    scorer = Scorer()
    extractor = DomainSpecificExtractor(
        source_file=document.filepath,
        train_source_file="myseq2seq/data/wmt14/train.tok.clean.bpe.32000.de",
        train_vocab_file="myseq2seq/train_vocab.pkl")
    keyphrases = extractor.extract_keyphrases()

    num_changes = 0
    for i, sentence in enumerate(document.sentences):
        if sentence.corrected:
            continue
        translation, attn, translations = seq2seq_model.translate(sentence.source)
        beam = translationsToTree(translations)
        score = scorer.compute_scores(sentence.source, " ".join(translation),
                                      attn, keyphrases)
        score["order_id"] = i

        translation_text = " ".join(translation)
        if translation_text != sentence.translation:
            num_changes += 1
            sentence.diff = html_diff(sentence.translation[:-4].replace("@@ ", ""),
                                      translation_text[:-4].replace("@@ ", ""))
        sentence.translation = translation_text
        sentence.beam = beam
        sentence.score = score
        sentence.attention = attn

    save_document(document, document_id)
    return jsonify({"numChanges": num_changes})
# Excerpt of a scraper-class method; assumes collections, hashlib, logging,
# simplediff's html_diff, and the class's DB handles and helper methods are
# available at module level.
def get_data(self):
    # load all of the anchors on the front page
    self.get_all_anchor_frontpage()
    # transform anchors to dict form
    self.transform_anchor_to_dict()
    # check data in db
    for article_id in self.anchor_dict.keys():
        print(article_id, self.anchor_dict[article_id]['art_route'])
        self.get_article_data(article_id, self.anchor_dict[article_id]['art_route'])

        # Hash only the content fields: drop the volatile bookkeeping keys first.
        temp_ord_dict = collections.OrderedDict(sorted(self.anchor_dict[article_id].items()))
        del temp_ord_dict['id_']
        del temp_ord_dict['epoch_app_start']
        del temp_ord_dict['date_app_start']
        del temp_ord_dict['epoch_app_save']
        del temp_ord_dict['date_app_save']
        del temp_ord_dict['last_checkup']
        self.anchor_dict[article_id]['article_hash'] = hashlib.sha224(
            repr(temp_ord_dict.items()).encode('utf-8')).hexdigest()

        if self.article_db.find_one(art_id=article_id) is None:
            # save new data
            logging.info('Adding new article: {article_url}'.format(
                article_url=self.anchor_dict[article_id]))
            self.anchor_dict[article_id]['article_version'] = 1
            self.article_db.insert(self.anchor_dict[article_id])
        else:
            logging.info('Updating article: {article_url}'.format(
                article_url=self.anchor_dict[article_id]))
            # update article if there is a reason
            check_last_version = self.db_file.query(
                """SELECT rowid, * FROM tvp_news WHERE art_id = "{art_id}"
                   ORDER BY epoch_app_save DESC LIMIT 1""".format(art_id=article_id))
            for row_ in check_last_version:
                if row_['article_hash'] != self.anchor_dict[article_id]['article_hash']:
                    logging.info('Logging change for: {article_url}'.format(
                        article_url=self.anchor_dict[article_id]))
                    self.anchor_dict[article_id]['article_version'] = int(row_['article_version']) + 1
                    if row_['art_route'] != self.anchor_dict[article_id]['art_route']:
                        self.anchor_dict[article_id]['art_route_change'] = html_diff(
                            row_['art_route'], self.anchor_dict[article_id]['art_route'])
                        self.prepare_img(article_id, 'art_route_change')
                        insta_txt = ('Change in link'
                                     + '\r\n'
                                     + '#tvp #tvpinfo #monitormedia')
                        self.insta_msg(insta_txt)
                    if row_['art_route_txt'] != self.anchor_dict[article_id]['art_route_txt']:
                        self.anchor_dict[article_id]['art_route_txt_change'] = html_diff(
                            row_['art_route_txt'], self.anchor_dict[article_id]['art_route_txt'])
                        self.prepare_img(article_id, 'art_route_txt_change')
                        insta_txt = ('Change in link text'
                                     + '\r\n'
                                     + '#tvp #tvpinfo #monitormedia')
                        self.insta_msg(insta_txt)
                    if row_['headline_txt'] != self.anchor_dict[article_id]['headline_txt']:
                        self.anchor_dict[article_id]['headline_change'] = html_diff(
                            row_['headline_txt'], self.anchor_dict[article_id]['headline_txt'])
                        self.prepare_img(article_id, 'headline_change')
                        insta_txt = ('Change in article headline'
                                     + '\r\n'
                                     + '#tvp #tvpinfo #monitormedia')
                        self.insta_msg(insta_txt)
                    if row_['article_txt'] != self.anchor_dict[article_id]['article_txt']:
                        self.anchor_dict[article_id]['art_txt_change'] = html_diff(
                            row_['article_txt'], self.anchor_dict[article_id]['article_txt'])
                        self.prepare_img(article_id, 'art_txt_change')
                        insta_txt = ('Change in article text'
                                     + '\r\n'
                                     + '#tvp #tvpinfo #monitormedia')
                        self.insta_msg(insta_txt)
                    self.article_db.insert(self.anchor_dict[article_id])
                else:
                    logging.info('Update with no change for: {article_url}'.format(
                        article_url=self.anchor_dict[article_id]))
                    update_data = dict(id=row_['id'],
                                       last_checkup=self.anchor_dict[article_id]['last_checkup'])
                    self.article_db.update(update_data, ['id'])
    self.inst_stories()
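# Condensed sketch of the change-detection idea in get_data above: drop the
# volatile bookkeeping keys, then hash the sorted remainder so the hash moves
# only when article content moves. Field names here are illustrative.
import collections
import hashlib

record = {'headline_txt': 'Title', 'article_txt': 'Body', 'last_checkup': 1700000000}
stable = collections.OrderedDict(sorted(record.items()))
del stable['last_checkup']
article_hash = hashlib.sha224(repr(stable.items()).encode('utf-8')).hexdigest()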
def parent_diff(self, obj):
    if not obj.parent:
        return "This object has no parent"
    return html_diff(obj.parent.text, obj.text)
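# Hypothetical usage sketch for parent_diff: _Node is a stand-in for whatever
# object the method actually receives; only .text and .parent matter here.
from simplediff import html_diff


class _Node:
    def __init__(self, text, parent=None):
        self.text = text
        self.parent = parent


child = _Node('the revised sentence', parent=_Node('the original sentence'))
print(html_diff(child.parent.text, child.text))
# -> the <del>original</del> <ins>revised</ins> sentence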
r = r.replace("<del>", " **[-") r = r.replace("</del>", "-]** ") return r def shorten(s, length): return s if len(s) < length else (s[:length] + "...") def html_to_text(html): return BeautifulSoup(html, 'html.parser').get_text() if __name__ == '__main__': old = "【全部公告本科生院 研究生院关于2020-2021学年秋冬学期课程调整安排的通知】 各学院(系),行政各部门,各校区管委会,直属各单位,各任课教师、各位同学:" new = "【全部公告研究生院、本科生院 关于2020-2021学年秋冬学期课程调整安排的通知】 各学院(系),行政各部门,各校区管委会,直属各单位,各任课教师、各位同学:" r1 = html_diff(old, new) r2 = html_diff2(old, new) r3 = html_diff_to_markdown(r1) r4 = html_diff_to_markdown(r2) print(old) print(new) print(r1) print(r2) print(r3) print(r4) # print(diff_result)
from Bible import Bible
from simplediff import html_diff

# open our two bibles
kjv = Bible('bibles/kjv.xml')
nkjv = Bible('bibles/nkjv.xml')

extension = '.mdown'
directory = './kjv_vs_nkjv/'

# go in order through all books, chapters, verses
for book in kjv.bible:
    print('opening %s%s%s for writing' % (directory, book, extension))
    with open('%s%s%s' % (directory, book, extension), 'w') as f:
        for chapter in kjv.bible[book]:
            for verse in kjv.bible[book][chapter]:
                kjv_body = kjv.bible[book][chapter][verse]
                nkjv_body = nkjv.bible[book][chapter][verse]
                # Only write if something is different
                if kjv_body != nkjv_body:
                    data = '<p><b>[%s %s:%s]</b> %s</p>' % \
                        (book, chapter, verse, html_diff(kjv_body, nkjv_body))
                    f.write(data)