Example #1
    def show_diff(self, old, new):
        if len(old) == 0 or len(new) == 0:
            logging.info('Old or New empty')
            return False
        new_hash = hashlib.sha224(new.encode('utf8')).hexdigest()
        logging.info(html_diff(old, new))
        html = """
        <!doctype html>
        <html lang="en">
          <head>
            <meta charset="utf-8">
            <link rel="stylesheet" href="./css/styles.css">
          </head>
          <body>
          <p>
          {}
          </p>
          </body>
        </html>
        """.format(html_diff(old, new))
        with open('tmp.html', 'w') as f:
            f.write(html)

        CHROMEDRIVER_PATH = os.environ.get('CHROMEDRIVER_PATH',
                                           '/usr/local/bin/chromedriver')
        driver = webdriver.Chrome(CHROMEDRIVER_PATH)
        driver.get('file://%s/tmp.html' % os.getcwd())
        e = driver.find_element_by_xpath('//p')
        start_height = e.location['y']
        block_height = e.size['height']
        end_height = start_height
        start_width = e.location['x']
        block_width = e.size['width']
        end_width = start_width
        total_height = start_height + block_height + end_height
        total_width = start_width + block_width + end_width
        timestamp = str(int(time.time()))
        driver.save_screenshot('./tmp.png')
        img = Image.open('./tmp.png')
        img2 = img.crop((0, 0, total_width, total_height))
        if int(total_width) > int(total_height * 2):
            background = Image.new('RGBA', (total_width, int(total_width / 2)),
                                   (255, 255, 255, 0))
            bg_w, bg_h = background.size
            offset = (int(
                (bg_w - total_width) / 2), int((bg_h - total_height) / 2))
        else:
            background = Image.new('RGBA', (total_width, total_height),
                                   (255, 255, 255, 0))
            bg_w, bg_h = background.size
            offset = (int(
                (bg_w - total_width) / 2), int((bg_h - total_height) / 2))
        background.paste(img2, offset)
        self.filename = timestamp + new_hash
        background.save('./output/' + self.filename + '.png')
        return True
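All of these examples embed the output of simplediff's html_diff, which marks word-level changes with <del> and <ins> tags; that string is what gets interpolated into the HTML template above before the screenshot is taken. A minimal stand-alone sketch (the sample strings here are made up for illustration):

from simplediff import html_diff

old = "the quick brown fox"
new = "the quick red fox"

print(html_diff(old, new))
# Expected output: word-level markup such as
# the quick <del>brown</del> <ins>red</ins> fox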
Example #2
def generate_diff(old, new):
    if len(old) == 0 or len(new) == 0:
        logging.info('Old or New empty')
        return False
    new_hash = hashlib.sha224(new.encode('utf8')).hexdigest()
    logging.info(html_diff(old, new))
    html = """
    <!doctype html>
    <html lang="en">
      <head>
        <meta charset="utf-8">
        <link rel="stylesheet" href="./css/styles.css">
      </head>
      <body>
      <p>
      {}
      </p>
      </body>
    </html>
    """.format(html_diff(old, new))
    with open('tmp.html', 'w') as f:
        f.write(html)

    driver = webdriver.PhantomJS(executable_path=PHANTOMJS_PATH + '/phantomjs')
    driver.get('tmp.html')
    e = driver.find_element_by_xpath('//p')
    start_height = e.location['y']
    block_height = e.size['height']
    end_height = start_height
    start_width = e.location['x']
    block_width = e.size['width']
    end_width = start_width
    total_height = start_height + block_height + end_height
    total_width = start_width + block_width + end_width
    timestamp = str(int(time.time()))
    driver.save_screenshot('./tmp.png')
    img = Image.open('./tmp.png')
    img2 = img.crop((0, 0, total_width, total_height))
    if int(total_width) > int(total_height * 2):
        background = Image.new('RGBA', (total_width, int(total_width / 2)),
                               (255, 255, 255, 0))
        bg_w, bg_h = background.size
        offset = (int((bg_w - total_width) / 2), int(
            (bg_h - total_height) / 2))
    else:
        background = Image.new('RGBA', (total_width, total_height),
                               (255, 255, 255, 0))
        bg_w, bg_h = background.size
        offset = (int((bg_w - total_width) / 2), int(
            (bg_h - total_height) / 2))
    background.paste(img2, offset)
    filename = timestamp + new_hash
    exported_filename = './output/' + filename + '.png'
    background.save(exported_filename)

    return True, exported_filename
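Examples #1 and #2 (and #5 below) share the same padding step: if the cropped diff is more than twice as wide as it is tall, it is pasted onto a 2:1 canvas and centered vertically; otherwise the canvas simply matches the crop. A minimal sketch of that step as a stand-alone helper, assuming Pillow; the name pad_to_canvas is chosen here for illustration and is not part of the original code:

from PIL import Image

def pad_to_canvas(img, total_width, total_height):
    # Wide, short crops get a 2:1 canvas so the saved image keeps a
    # predictable aspect ratio; otherwise the canvas matches the crop.
    if total_width > total_height * 2:
        canvas_size = (total_width, total_width // 2)
    else:
        canvas_size = (total_width, total_height)
    background = Image.new('RGBA', canvas_size, (255, 255, 255, 0))
    bg_w, bg_h = background.size
    # Center the crop; the x offset is always 0 because the canvas is
    # exactly as wide as the crop.
    offset = ((bg_w - total_width) // 2, (bg_h - total_height) // 2)
    background.paste(img, offset)
    return background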
Example #3
def correctTranslation():
    data = request.get_json()
    translation = data["translation"]
    beam = data["beam"]
    document_unk_map = data["document_unk_map"]
    attention = data["attention"]
    document_id = data["document_id"]
    sentence_id = data["sentence_id"]

    document = get_document(document_id)

    extractor = DomainSpecificExtractor(
        source_file=document.filepath,
        train_source_file="myseq2seq/data/wmt14/train.tok.clean.bpe.32000.de",
        train_vocab_file="myseq2seq/train_vocab.pkl")
    keyphrases = extractor.extract_keyphrases()

    for key in document_unk_map:
        if key not in document.unk_map:
            document.unk_map[key] = document_unk_map[key]
        else:
            # Merge list values
            document.unk_map[key] = list(
                set(document.unk_map[key]) | set(document_unk_map[key]))

    sentence = document.sentences[int(sentence_id)]

    if translation != sentence.translation:
        sentence.diff = html_diff(sentence.translation[:-6].replace("@@ ", ""),
                                  translation[:-6].replace("@@ ", ""))
    sentence.translation = translation
    sentence.corrected = True
    sentence.flagged = False
    sentence.attention = attention
    sentence.beam = beam

    scorer = Scorer()
    score = scorer.compute_scores(sentence.source, sentence.translation,
                                  attention, keyphrases)
    score["order_id"] = sentence.score["order_id"]
    sentence.score = score

    document.sentences[int(sentence_id)] = sentence

    save_document(document, document_id)

    from myseq2seq.train import train_iters
    pairs = [sentence.source, sentence.translation[:-6]]
    print(pairs)
    # train_iters(seq2seq_model.encoder, seq2seq_model.decoder, seq2seq_model.input_lang, seq2seq_model.output_lang,
    #           pairs, batch_size=1, print_every=1, n_epochs=1)

    return jsonify({})
Example #4
def correctTranslation():
    data = request.get_json()
    translation = data["translation"]
    beam = data["beam"]
    document_unk_map = data["document_unk_map"]
    attention = data["attention"]
    document_id = data["document_id"]
    sentence_id = data["sentence_id"]

    document = get_document(document_id)

    extractor = DomainSpecificExtractor(source_file=document.filepath, src_lang=SRC_LANG, tgt_lang=TGT_LANG,
                                        train_source_file=f".data/wmt14/train.tok.clean.bpe.32000.{SRC_LANG}",
                                        train_vocab_file=f".data/vocab/train_vocab_{SRC_LANG}.pkl")
    keyphrases = extractor.extract_keyphrases()

    for key in document_unk_map:
        if key not in document.unk_map:
            document.unk_map[key] = document_unk_map[key]
        else:
            # Merge list values
            document.unk_map[key] = list(set(document.unk_map[key]) | set(document_unk_map[key]))

    sentence = document.sentences[int(sentence_id)]

    if translation != sentence.translation:
        sentence.diff = html_diff(sentence.translation[:-4].replace("@@ ", ""),
                                  translation[:-4].replace("@@ ", ""))
    sentence.translation = translation
    sentence.corrected = True
    sentence.flagged = False
    sentence.attention = attention
    sentence.beam = beam

    scorer = Scorer()
    score = scorer.compute_scores(sentence.source, sentence.translation, attention, keyphrases, "")
    score["order_id"] = sentence.score["order_id"]
    sentence.score = score

    document.sentences[int(sentence_id)] = sentence

    save_document(document, document_id)

    return jsonify({})
Example #5
    def generate_image_diff(old: str, new: str, text_to_tweet: str):
        ImageDiffGenerator.init()
        stripped_old = ImageDiffGenerator.separate_punctuation(strip_html(old))
        stripped_new = ImageDiffGenerator.separate_punctuation(strip_html(new))
        new_hash = hashlib.sha224(stripped_new.encode('utf8')).hexdigest()
        separated_diff = html_diff(stripped_old, stripped_new)
        diff_html = ImageDiffGenerator.restore_punctuation(separated_diff)

        html = ImageDiffGenerator.html_template.replace("text_to_tweet", text_to_tweet) \
            .replace("diff_html", diff_html)

        with open('tmp.html', 'w', encoding="utf-8") as f:
            f.write(html)

        ImageDiffGenerator.driver.get('tmp.html')

        e = ImageDiffGenerator.driver.find_element_by_id('wrapper')
        start_height = e.location['y']
        block_height = e.size['height']
        end_height = start_height
        total_height = start_height + block_height + end_height
        total_width = 510  # Override because body width is set to 500
        timestamp = str(int(time.time()))
        ImageDiffGenerator.driver.save_screenshot('./tmp.png')
        img = Image.open('./tmp.png')
        img2 = img.crop((0, 0, total_width, total_height))
        if int(total_width) > int(total_height * 2):
            background = Image.new('RGBA', (total_width, int(total_width / 2)),
                                   (255, 255, 255, 0))
            bg_w, bg_h = background.size
            offset = (int(
                (bg_w - total_width) / 2), int((bg_h - total_height) / 2))
        else:
            background = Image.new('RGBA', (total_width, total_height),
                                   (255, 255, 255, 0))
            bg_w, bg_h = background.size
            offset = (int(
                (bg_w - total_width) / 2), int((bg_h - total_height) / 2))
        background.paste(img2, offset)
        filename = timestamp + new_hash
        saved_file_path = f'./output/{filename}.png'
        background.save(saved_file_path)
        return saved_file_path
Example #6
def retranslateSentenceWithId(i, sentence, scorer, keyphrases, num_changes, beam_size=3, attLayer=-2, force=False):
    print("Retranslate: " + str(i))

    if sentence.corrected and not force:
        return sentence, num_changes

    translation, attn, translations = model.translate(sentence.source, beam_size=beam_size, attLayer=attLayer)
    beam = translationsToTree(translations)

    score = scorer.compute_scores(sentence.source, " ".join(translation), attn, keyphrases, "")
    score["order_id"] = i

    translation_text = " ".join(translation)
    if translation_text != sentence.translation:
        num_changes += 1
        sentence.diff = html_diff(sentence.translation[:-4].replace("@@ ", ""),
                                  translation_text[:-4].replace("@@ ", ""))
    sentence.translation = translation_text
    sentence.beam = beam
    sentence.score = score
    sentence.attention = attn

    return sentence, num_changes
Example #7
def retranslate(document_id):
    document = get_document(document_id)
    scorer = Scorer()
    extractor = DomainSpecificExtractor(
        source_file=document.filepath,
        train_source_file="myseq2seq/data/wmt14/train.tok.clean.bpe.32000.de",
        train_vocab_file="myseq2seq/train_vocab.pkl")
    keyphrases = extractor.extract_keyphrases()
    num_changes = 0

    for i, sentence in enumerate(document.sentences):
        if sentence.corrected:
            continue

        translation, attn, translations = seq2seq_model.translate(
            sentence.source)

        beam = translationsToTree(translations)

        score = scorer.compute_scores(sentence.source, " ".join(translation),
                                      attn, keyphrases)
        score["order_id"] = i

        translation_text = " ".join(translation)
        if translation_text != sentence.translation:
            num_changes += 1
            sentence.diff = html_diff(
                sentence.translation[:-4].replace("@@ ", ""),
                translation_text[:-4].replace("@@ ", ""))
        sentence.translation = translation_text
        sentence.beam = beam
        sentence.score = score
        sentence.attention = attn

    save_document(document, document_id)
    return jsonify({"numChanges": num_changes})
Example #8
    def get_data(self):

        # load all of the anchors
        self.get_all_anchor_frontpage()
        # transform anchors to dict form
        self.transform_anchor_to_dict()
        # check data in db
        for article_id in self.anchor_dict.keys():

            print(article_id, self.anchor_dict[article_id]['art_route'])
            self.get_article_data(article_id, self.anchor_dict[article_id]['art_route'])
            temp_ord_dict = collections.OrderedDict(sorted(self.anchor_dict[article_id].items()))

            del temp_ord_dict['id_']
            del temp_ord_dict['epoch_app_start']
            del temp_ord_dict['date_app_start']
            del temp_ord_dict['epoch_app_save']
            del temp_ord_dict['date_app_save']
            del temp_ord_dict['last_checkup']

            self.anchor_dict[article_id]['article_hash'] = hashlib.sha224(
                repr(temp_ord_dict.items()).encode('utf-8')).hexdigest()

            if self.article_db.find_one(art_id=article_id) is None:
                # save new data
                logging.info('Adding new article: {article_url}'.format(article_url=self.anchor_dict[article_id]))

                self.anchor_dict[article_id]['article_version'] = 1
                self.article_db.insert(self.anchor_dict[article_id])

            else:

                logging.info('Updating article: {article_url}'.format(article_url=self.anchor_dict[article_id]))
                # update article if there is a reason
                check_last_version = self.db_file.query("""SELECT rowid, *
                                                            FROM tvp_news
                                                            WHERE art_id = "{art_id}"
                                                            ORDER BY epoch_app_save DESC
                                                            LIMIT 1""".format(art_id=article_id))

                for row_ in check_last_version:

                    if row_['article_hash'] != self.anchor_dict[article_id]['article_hash']:
                        logging.info('Logging change for: {article_url}'.format(article_url=self.anchor_dict[article_id]))
                        self.anchor_dict[article_id]['article_version'] = int(row_['article_version']) + 1

                        if row_['art_route'] != self.anchor_dict[article_id]['art_route']:

                            self.anchor_dict[article_id]['art_route_change'] = html_diff(row_['art_route'],
                                                                                         self.anchor_dict[article_id]['art_route'])

                            self.prepare_img(article_id, 'art_route_change')

                            insta_txt = 'Change in link' \
                                        + '\r\n' \
                                        + '#tvp #tvpinfo #monitormedia'

                            self.insta_msg(insta_txt)



                        if row_['art_route_txt'] != self.anchor_dict[article_id]['art_route_txt']:
                            self.anchor_dict[article_id]['art_route_txt_change'] = html_diff(row_['art_route_txt'],
                                                                                         self.anchor_dict[article_id][
                                                                                             'art_route_txt'])

                            self.prepare_img(article_id, 'art_route_txt_change')

                            insta_txt = 'Change in link text' \
                                        + '\r\n' \
                                        + '#tvp #tvpinfo #monitormedia'

                            self.insta_msg(insta_txt)

                        if row_['headline_txt'] != self.anchor_dict[article_id]['headline_txt']:
                            self.anchor_dict[article_id]['headline_change'] = html_diff(row_['headline_txt'],
                                                                                         self.anchor_dict[article_id][
                                                                                             'headline_txt'])

                            self.prepare_img(article_id, 'headline_change')

                            insta_txt = 'Change in article headline' \
                                        + '\r\n' \
                                        + '#tvp #tvpinfo #monitormedia'

                            self.insta_msg(insta_txt)

                        if row_['article_txt'] != self.anchor_dict[article_id]['article_txt']:
                            self.anchor_dict[article_id]['art_txt_change'] = html_diff(row_['article_txt'],
                                                                                         self.anchor_dict[article_id][
                                                                                             'article_txt'])

                            self.prepare_img(article_id, 'art_txt_change')

                            insta_txt = 'Change in article text' \
                                        + '\r\n' \
                                        + '#tvp #tvpinfo #monitormedia'

                            self.insta_msg(insta_txt)

                        self.article_db.insert(self.anchor_dict[article_id])


                    else:
                        logging.info('Update with no change for: {article_url}'.format(article_url=self.anchor_dict[article_id]))
                        update_data = dict(id=row_['id'], last_checkup=self.anchor_dict[article_id]['last_checkup'])
                        self.article_db.update(update_data, ['id'])

        self.inst_stories()
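The SELECT in get_data builds its WHERE clause with str.format, which only works while art_id never contains quote characters. A parameterized query sidesteps that; below is a minimal sketch of the same lookup using the standard-library sqlite3 module, which is an assumption for illustration only, since the code above goes through its own db_file wrapper:

import sqlite3

def latest_article_version(conn, art_id):
    # Fetch the most recently saved row for one article, letting the
    # driver quote art_id instead of interpolating it into the SQL text.
    cur = conn.execute(
        """SELECT rowid, *
           FROM tvp_news
           WHERE art_id = ?
           ORDER BY epoch_app_save DESC
           LIMIT 1""",
        (art_id,),
    )
    return cur.fetchone()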
Example #9
    def parent_diff(self, obj):
        if not obj.parent:
            return "This object has no parent"
        return html_diff(obj.parent.text, obj.text)
Example #10
    r = r.replace("<del>", " **[-")
    r = r.replace("</del>", "-]** ")
    return r


def shorten(s, length):
    return s if len(s) < length else (s[:length] + "...")


def html_to_text(html):
    return BeautifulSoup(html, 'html.parser').get_text()


if __name__ == '__main__':
    old = "【全部公告本科生院 研究生院关于2020-2021学年秋冬学期课程调整安排的通知】 各学院(系),行政各部门,各校区管委会,直属各单位,各任课教师、各位同学:"
    new = "【全部公告研究生院、本科生院 关于2020-2021学年秋冬学期课程调整安排的通知】 各学院(系),行政各部门,各校区管委会,直属各单位,各任课教师、各位同学:"

    r1 = html_diff(old, new)
    r2 = html_diff2(old, new)
    r3 = html_diff_to_markdown(r1)
    r4 = html_diff_to_markdown(r2)

    print(old)
    print(new)
    print(r1)
    print(r2)
    print(r3)
    print(r4)

# print(diff_result)
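Example #10 opens mid-function with the <del>-to-markdown replacements. A stand-alone sketch of the same idea, with an <ins> mapping added by analogy; the name diff_tags_to_markdown and the **[+...+]** markers are assumptions rather than code from the original:

def diff_tags_to_markdown(r):
    # Map simplediff's HTML markers to plain-text markdown-style markers.
    r = r.replace("<ins>", " **[+")
    r = r.replace("</ins>", "+]** ")
    r = r.replace("<del>", " **[-")
    r = r.replace("</del>", "-]** ")
    return r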
Example #11
from Bible import Bible
from simplediff import html_diff

# open our two bibles 
kjv = Bible('bibles/kjv.xml')
nkjv = Bible('bibles/nkjv.xml')

extension = '.mdown'
directory = './kjv_vs_nkjv/'

# go in order through all books, chapters, verses
for book in kjv.bible:
    print('opening %s%s%s for writing' % (directory, book, extension))
    f = open('%s%s%s' % (directory, book, extension), 'w')
    for chapter in kjv.bible[book]:
        for verse in kjv.bible[book][chapter]:
            kjv_body = kjv.bible[book][chapter][verse]
            nkjv_body = nkjv.bible[book][chapter][verse]
            
            # Only write if something is different
            if kjv_body != nkjv_body:
                data = '<p><b>[%s %s:%s]</b> %s</p>' % \
                    (book, chapter, verse, html_diff(kjv_body, nkjv_body))
                f.write(data)
    f.close()