def _is_change_significant(self, original_word, input_word):
    """Tell whether two words differ by more than dropped nonspacing marks.

    Both words are passed through ``textprocess.normalize`` and compared as
    single-line sequences with ``difflib.Differ``. Only the ``'? '`` guide
    lines of the result are inspected.

    Args:
        original_word: Word from the original text.
        input_word: Word from the user input.

    Returns:
        True if a guide line marks an insertion (``+``), or marks a deletion
        (``-``) of a character whose Unicode category is not ``Mn``
        (nonspacing mark). False otherwise, including when Differ emits no
        guide line at all.
    """
    # still using normalization on the original text. With enough
    # modification of the original text, this won't be necessary
    original_word_normalised = textprocess.normalize(original_word)
    input_word_normalised = textprocess.normalize(input_word)

    diffs = list(difflib.Differ().compare(
        [original_word_normalised], [input_word_normalised]))

    # Every Differ line starts with a 2-character prefix ('- ', '+ ', '? ').
    # A guide line is column-aligned with the line just before it, so guide
    # column j describes character j of that previous line, prefix included.
    prefix_len = 2
    for i, diff in enumerate(diffs):
        if not diff.startswith('? '):
            continue
        previous = diffs[i - 1]
        for j, c in enumerate(diff[prefix_len:], start=prefix_len):
            # NOTE(review): Differ marks replaced characters with '^', never
            # '?', so the '?' test below cannot match -- confirm whether '^'
            # was intended.
            if c == '+' or c == '?':
                return True
            # Deleting a nonspacing mark (e.g. a diacritic) is not counted
            # as significant; deleting anything else is.
            if c == '-' and unicodedata.category(previous[j]) != 'Mn':
                return True
    return False
def _is_change_significant(self, original_word, input_word):
    """Tell whether two words differ by more than dropped nonspacing marks.

    Both words are passed through ``textprocess.normalize`` and compared as
    single-line sequences with ``difflib.Differ``. Only the ``'? '`` guide
    lines of the result are inspected.

    Args:
        original_word: Word from the original text.
        input_word: Word from the user input.

    Returns:
        True if a guide line marks an insertion (``+``), or marks a deletion
        (``-``) of a character whose Unicode category is not ``Mn``
        (nonspacing mark). False otherwise, including when Differ emits no
        guide line at all.
    """
    # still using normalization on the original text. With enough
    # modification of the original text, this won't be necessary
    original_word_normalised = textprocess.normalize(original_word)
    input_word_normalised = textprocess.normalize(input_word)

    diffs = list(difflib.Differ().compare(
        [original_word_normalised], [input_word_normalised]))

    # Every Differ line starts with a 2-character prefix ('- ', '+ ', '? ').
    # A guide line is column-aligned with the line just before it, so guide
    # column j describes character j of that previous line, prefix included.
    prefix_len = 2
    for i, diff in enumerate(diffs):
        if not diff.startswith('? '):
            continue
        previous = diffs[i - 1]
        for j, c in enumerate(diff[prefix_len:], start=prefix_len):
            # NOTE(review): Differ marks replaced characters with '^', never
            # '?', so the '?' test below cannot match -- confirm whether '^'
            # was intended.
            if c == '+' or c == '?':
                return True
            # Deleting a nonspacing mark (e.g. a diacritic) is not counted
            # as significant; deleting anything else is.
            if c == '-' and unicodedata.category(previous[j]) != 'Mn':
                return True
    return False
def test_normalize_function():
    """Report verses whose text changes under ``textprocess.normalize``.

    Reads every row of ``quran_diacritic`` from ``db.sqlite3``, taking
    columns 1, 2 and 3 as surah number, verse number and verse text. The
    list of verse texts is normalized in one call and each result is
    compared with its input. Differing verses are written to
    ``mismatches.txt``; the distinct characters that differ position by
    position are printed to stdout, followed by their Unicode names.
    """
    import sqlite3
    import unicodedata as ud
    from contextlib import closing

    import qurantextdiff.helpers.textprocess as textprocess

    surah_no = []
    verse_no = []
    diacritic_verses = []
    # sqlite3's own context manager only commits or rolls back; `closing`
    # is what actually releases the connection.
    with closing(sqlite3.connect('db.sqlite3')) as connection:
        cursor = connection.cursor()
        cursor.execute('SELECT * FROM quran_diacritic')
        for row in cursor.fetchall():
            surah_no.append(row[1])
            verse_no.append(row[2])
            diacritic_verses.append(row[3])

        cursor.execute('SELECT * FROM quran_non_diacritic')
        # NOTE(review): fetched but never used below -- possibly the intended
        # comparison target for the normalized text; confirm.
        non_diacritic_verses = [row[3] for row in cursor.fetchall()]

    diacritic_verses_normalized = textprocess.normalize(diacritic_verses)

    mismatches = []
    mismatch_chars = []  # list, not set: first-seen order is what gets printed
    for s_no, v_no, diac_normal, diac in zip(
            surah_no, verse_no, diacritic_verses_normalized, diacritic_verses):
        # Plain comparison instead of assert/except AssertionError, so the
        # check still works when asserts are stripped with `python -O`.
        if diac_normal == diac:
            continue
        for c1, c2 in zip(diac_normal, diac):
            if c1 != c2:
                if c1 not in mismatch_chars:
                    mismatch_chars.append(c1)
                if c2 not in mismatch_chars:
                    mismatch_chars.append(c2)
        mismatches.append((s_no, v_no, diac_normal, diac))

    # `with` guarantees the report is flushed and closed.
    with open('mismatches.txt', 'w', encoding='utf-8') as out:
        for s, v, d, n in mismatches:
            print('{}:{}\n{}\n{}\n'.format(s, v, d, n), file=out)

    print(len(mismatch_chars))
    print('\n'.join(mismatch_chars))
    for mc in mismatch_chars:
        # Default keeps the report going for code points with no Unicode name
        # (ud.name raises ValueError for those otherwise).
        print(ud.name(mc, '<unnamed>'))
def test_normalize_function():
    """Report verses whose text changes under ``textprocess.normalize``.

    Reads every row of ``quran_diacritic`` from ``db.sqlite3``, taking
    columns 1, 2 and 3 as surah number, verse number and verse text. The
    list of verse texts is normalized in one call and each result is
    compared with its input. Differing verses are written to
    ``mismatches.txt``; the distinct characters that differ position by
    position are printed to stdout, followed by their Unicode names.
    """
    import sqlite3
    import unicodedata as ud
    from contextlib import closing

    import qurantextdiff.helpers.textprocess as textprocess

    surah_no = []
    verse_no = []
    diacritic_verses = []
    # sqlite3's own context manager only commits or rolls back; `closing`
    # is what actually releases the connection.
    with closing(sqlite3.connect("db.sqlite3")) as connection:
        cursor = connection.cursor()
        cursor.execute("SELECT * FROM quran_diacritic")
        for row in cursor.fetchall():
            surah_no.append(row[1])
            verse_no.append(row[2])
            diacritic_verses.append(row[3])

        cursor.execute("SELECT * FROM quran_non_diacritic")
        # NOTE(review): fetched but never used below -- possibly the intended
        # comparison target for the normalized text; confirm.
        non_diacritic_verses = [row[3] for row in cursor.fetchall()]

    diacritic_verses_normalized = textprocess.normalize(diacritic_verses)

    mismatches = []
    mismatch_chars = []  # list, not set: first-seen order is what gets printed
    for s_no, v_no, diac_normal, diac in zip(
            surah_no, verse_no, diacritic_verses_normalized, diacritic_verses):
        # Plain comparison instead of assert/except AssertionError, so the
        # check still works when asserts are stripped with `python -O`.
        if diac_normal == diac:
            continue
        for c1, c2 in zip(diac_normal, diac):
            if c1 != c2:
                if c1 not in mismatch_chars:
                    mismatch_chars.append(c1)
                if c2 not in mismatch_chars:
                    mismatch_chars.append(c2)
        mismatches.append((s_no, v_no, diac_normal, diac))

    # `with` guarantees the report is flushed and closed.
    with open("mismatches.txt", "w", encoding="utf-8") as out:
        for s, v, d, n in mismatches:
            print("{}:{}\n{}\n{}\n".format(s, v, d, n), file=out)

    print(len(mismatch_chars))
    print("\n".join(mismatch_chars))
    for mc in mismatch_chars:
        # Default keeps the report going for code points with no Unicode name
        # (ud.name raises ValueError for those otherwise).
        print(ud.name(mc, "<unnamed>"))
def _create_tooltip(self, s1, s2):
    """Describe, one edit per line, how ``s1`` was turned into ``s2``.

    ``s1`` is normalized with ``textprocess.normalize`` before matching;
    ``s2`` is used as given (the original comment says the input word is
    already normalised). A delete and an insert carrying the same text are
    paired and reported as a displacement.

    Returns:
        Newline-joined messages in this order: displacements, remaining
        deletions, remaining insertions, replacements. Empty string when
        there are no such edits.
    """
    # still using normalisation on the original text. With enough
    # modification of the original text, this won't be necessary
    matcher = difflib.SequenceMatcher(None, textprocess.normalize(s1), s2)

    # NOTE(review): opcode indices refer to the normalized s1, yet the
    # slices below are taken from the raw s1 -- confirm normalize() keeps
    # character positions.
    inserted, deleted, replaced = [], [], []
    for op, a_lo, a_hi, b_lo, b_hi in matcher.get_opcodes():
        if op == 'insert':
            inserted.append((s2[b_lo:b_hi], a_lo))
        elif op == 'delete':
            deleted.append((s1[a_lo:a_hi], a_lo))
        elif op == 'replace':
            replaced.append((s1[a_lo:a_hi], s2[b_lo:b_hi], a_lo))

    insert_used = [False] * len(inserted)
    delete_used = [False] * len(deleted)
    lines = []

    # Pair each deletion with the first still-unpaired insertion of the
    # same text. NOTE(review): displacement positions are reported 1-based
    # while the other messages use 0-based positions.
    for d_idx, (d_text, d_pos) in enumerate(deleted):
        for i_idx, (i_text, i_pos) in enumerate(inserted):
            if insert_used[i_idx] or d_text != i_text:
                continue
            delete_used[d_idx] = True
            insert_used[i_idx] = True
            lines.append('Displaced {} from {} to {}'.format(
                d_text, d_pos + 1, i_pos + 1))
            break

    lines.extend(
        'Deleted {} at {}'.format(text, pos)
        for (text, pos), used in zip(deleted, delete_used) if not used)
    lines.extend(
        'Inserted {} at {}'.format(text, pos)
        for (text, pos), used in zip(inserted, insert_used) if not used)
    lines.extend(
        'Replaced {} with {} at {}'.format(old, new, pos)
        for old, new, pos in replaced)

    return '\n'.join(lines)
def _create_tooltip(self, s1, s2):
    """Describe, one edit per line, how ``s1`` was turned into ``s2``.

    ``s1`` is normalized with ``textprocess.normalize`` before matching;
    ``s2`` is used as given (the original comment says the input word is
    already normalised). A delete and an insert carrying the same text are
    paired and reported as a displacement.

    Returns:
        Newline-joined messages in this order: displacements, remaining
        deletions, remaining insertions, replacements. Empty string when
        there are no such edits.
    """
    # still using normalisation on the original text. With enough
    # modification of the original text, this won't be necessary
    matcher = difflib.SequenceMatcher(None, textprocess.normalize(s1), s2)

    # NOTE(review): opcode indices refer to the normalized s1, yet the
    # slices below are taken from the raw s1 -- confirm normalize() keeps
    # character positions.
    inserted, deleted, replaced = [], [], []
    for op, a_lo, a_hi, b_lo, b_hi in matcher.get_opcodes():
        if op == 'insert':
            inserted.append((s2[b_lo:b_hi], a_lo))
        elif op == 'delete':
            deleted.append((s1[a_lo:a_hi], a_lo))
        elif op == 'replace':
            replaced.append((s1[a_lo:a_hi], s2[b_lo:b_hi], a_lo))

    insert_used = [False] * len(inserted)
    delete_used = [False] * len(deleted)
    lines = []

    # Pair each deletion with the first still-unpaired insertion of the
    # same text. NOTE(review): displacement positions are reported 1-based
    # while the other messages use 0-based positions.
    for d_idx, (d_text, d_pos) in enumerate(deleted):
        for i_idx, (i_text, i_pos) in enumerate(inserted):
            if insert_used[i_idx] or d_text != i_text:
                continue
            delete_used[d_idx] = True
            insert_used[i_idx] = True
            lines.append('Displaced {} from {} to {}'.format(
                d_text, d_pos + 1, i_pos + 1))
            break

    lines.extend(
        'Deleted {} at {}'.format(text, pos)
        for (text, pos), used in zip(deleted, delete_used) if not used)
    lines.extend(
        'Inserted {} at {}'.format(text, pos)
        for (text, pos), used in zip(inserted, insert_used) if not used)
    lines.extend(
        'Replaced {} with {} at {}'.format(old, new, pos)
        for old, new, pos in replaced)

    return '\n'.join(lines)
def __init__(self, original_lines, input_lines):
    """Store both line sequences plus a normalized copy of the input.

    Args:
        original_lines: Sized collection of original lines.
        input_lines: Sized collection of input lines; must have the same
            length as ``original_lines``.

    Raises:
        AssertionError: If the two collections differ in length.
    """
    # Explicit raise instead of `assert` so the check is not stripped under
    # `python -O`; AssertionError is kept so callers see the same exception
    # type as before.
    if len(original_lines) != len(input_lines):
        raise AssertionError(
            'original_lines and input_lines must have the same length')

    self.original_lines = original_lines
    self.input_lines = input_lines
    self.input_lines_normalized = textprocess.normalize(input_lines)