Ejemplo n.º 1
0
    def _is_change_significant(self, original_word, input_word):
        """Decide whether the edit between two words is more than a dropped combining mark.

        Both words are passed through ``textprocess.normalize`` and compared as
        one-element sequences with ``difflib.Differ``. Only the ``'? '`` guide
        lines in the Differ output are inspected; if Differ produces no guide
        line, the result is False.

        :param original_word: word taken from the original text.
        :param input_word: word taken from the user input.
        :return: True if a guide line marks an insertion, or marks the deletion
            of a character whose Unicode category is not ``Mn``; else False.
        """
        # still using normalization on the original text. With enough modification of the original text,
        # this won't be necessary
        original_word_normalised = textprocess.normalize(original_word)
        input_word_normalised = textprocess.normalize(input_word)

        differ = difflib.Differ()
        diffs = list(differ.compare([original_word_normalised], [input_word_normalised]))

        for i, diff in enumerate(diffs):
            if not diff.startswith('? '):
                continue
            # A guide line annotates the line directly before it. Both lines
            # carry a two-character prefix, so it is stripped from each of them
            # to keep guide column j aligned with the character it marks.
            annotated = diffs[i - 1][2:]
            for j, c in enumerate(diff[2:]):
                # NOTE(review): difflib documents '^' (not '?') as the guide
                # marker for a replaced character, so the '?' comparison may
                # never match -- confirm whether '^' was intended here.
                if c == '+' or c == '?':
                    return True
                if c == '-' and unicodedata.category(annotated[j]) != 'Mn':
                    return True
        return False
Ejemplo n.º 2
0
    def _is_change_significant(self, original_word, input_word):
        """Tell whether two words differ by more than a removed combining mark.

        The words are normalised with ``textprocess.normalize`` and diffed as
        single-item lists through ``difflib.Differ``. The decision is based
        solely on the ``'? '`` guide lines of that diff; when there is no guide
        line the method returns False.

        :param original_word: word from the original text.
        :param input_word: word from the user input.
        :return: True when a guide line flags an insertion, or flags a deleted
            character whose Unicode category differs from ``Mn``; False
            otherwise.
        """
        # still using normalization on the original text. With enough modification of the original text,
        # this won't be necessary
        original_word_normalised = textprocess.normalize(original_word)
        input_word_normalised = textprocess.normalize(input_word)

        differ = difflib.Differ()
        diffs = list(
            differ.compare([original_word_normalised],
                           [input_word_normalised]))

        for i, diff in enumerate(diffs):
            if diff.startswith('? '):
                # The guide line describes the line just before it. Drop the
                # two-character prefix from that line as well, otherwise the
                # guide columns point two characters too far to the left.
                marked_line = diffs[i - 1][2:]
                for j, c in enumerate(diff[2:]):
                    # NOTE(review): guide lines use '^' for replacements per
                    # the difflib docs; the '?' test below may be dead code --
                    # confirm the intended marker.
                    if c == '+' or c == '?' or (
                            c == '-'
                            and unicodedata.category(marked_line[j]) != 'Mn'):
                        return True
        return False
Ejemplo n.º 3
0
def test_normalize_function():
    """Report verses whose normalized text differs from the stored text.

    Reads every row of ``quran_diacritic`` from ``db.sqlite3``, passes the
    verse texts (column 3) through ``textprocess.normalize`` and compares each
    result with the stored text. Every differing pair is written to
    ``mismatches.txt``; the distinct differing characters are printed together
    with their Unicode names.

    NOTE(review): the comparison baseline is the stored diacritic text itself;
    confirm that this, and not the ``quran_non_diacritic`` table, is intended.
    """
    import sqlite3
    import unicodedata as ud
    from contextlib import closing

    import qurantextdiff.helpers.textprocess as textprocess

    # sqlite3 connections are not closed by their own context manager,
    # hence the explicit closing() wrapper.
    with closing(sqlite3.connect('db.sqlite3')) as connection:
        cursor = connection.cursor()
        cursor.execute('SELECT * FROM quran_diacritic')
        rows = cursor.fetchall()

    surah_no = [row[1] for row in rows]
    verse_no = [row[2] for row in rows]
    diacritic_verses = [row[3] for row in rows]

    diacritic_verses_normalized = textprocess.normalize(diacritic_verses)

    mismatches = []
    mismatch_chars = []  # distinct differing characters, first-seen order

    for s_no, v_no, diac_normal, diac in zip(surah_no, verse_no,
                                             diacritic_verses_normalized,
                                             diacritic_verses):
        # Plain comparison instead of assert/except: asserts are stripped
        # under -O, which would silently hide every mismatch.
        if diac_normal == diac:
            continue
        for c1, c2 in zip(diac_normal, diac):
            if c1 != c2:
                for ch in (c1, c2):
                    if ch not in mismatch_chars:
                        mismatch_chars.append(ch)

        mismatches.append((s_no, v_no, diac_normal, diac))

    with open('mismatches.txt', 'w', encoding='utf-8') as report:
        for s, v, d, n in mismatches:
            print('{}:{}\n{}\n{}\n'.format(s, v, d, n), file=report)

    print(len(mismatch_chars))
    print('\n'.join(mismatch_chars))
    for mc in mismatch_chars:
        # Fall back to the code point for characters without a Unicode name
        # so the summary does not abort with ValueError.
        print(ud.name(mc, 'U+{:04X}'.format(ord(mc))))
Ejemplo n.º 4
0
def test_normalize_function():
    """Compare normalized verse texts with the stored ones and report differences.

    Loads all rows of ``quran_diacritic`` from ``db.sqlite3``, normalizes the
    verse texts (column 3) with ``textprocess.normalize`` and checks each
    result against the stored text. Differing pairs go to ``mismatches.txt``;
    the distinct differing characters and their Unicode names are printed.

    NOTE(review): the stored diacritic text is used as the comparison
    baseline; confirm that ``quran_non_diacritic`` was not meant instead.
    """
    import sqlite3
    import unicodedata as ud
    from contextlib import closing

    import qurantextdiff.helpers.textprocess as textprocess

    # closing() is needed because a sqlite3 connection used as a context
    # manager only manages transactions; it does not close the connection.
    with closing(sqlite3.connect("db.sqlite3")) as connection:
        cursor = connection.cursor()
        cursor.execute("SELECT * FROM quran_diacritic")
        rows = cursor.fetchall()

    surah_no = [row[1] for row in rows]
    verse_no = [row[2] for row in rows]
    diacritic_verses = [row[3] for row in rows]

    diacritic_verses_normalized = textprocess.normalize(diacritic_verses)

    mismatches = []
    mismatch_chars = []  # distinct differing characters, first-seen order

    for s_no, v_no, diac_normal, diac in zip(surah_no, verse_no, diacritic_verses_normalized, diacritic_verses):
        # An explicit comparison keeps working under -O, where assert
        # statements are removed.
        if diac_normal == diac:
            continue
        for c1, c2 in zip(diac_normal, diac):
            if c1 != c2:
                for ch in (c1, c2):
                    if ch not in mismatch_chars:
                        mismatch_chars.append(ch)

        mismatches.append((s_no, v_no, diac_normal, diac))

    with open("mismatches.txt", "w", encoding="utf-8") as report:
        for s, v, d, n in mismatches:
            print("{}:{}\n{}\n{}\n".format(s, v, d, n), file=report)

    print(len(mismatch_chars))
    print("\n".join(mismatch_chars))
    for mc in mismatch_chars:
        # Characters without a Unicode name are shown by code point instead
        # of raising ValueError.
        print(ud.name(mc, "U+{:04X}".format(ord(mc))))
Ejemplo n.º 5
0
    def _create_tooltip(self, s1, s2):
        """Describe, one line per edit, how ``s1`` differs from ``s2``.

        ``s1`` is normalised with ``textprocess.normalize`` and matched against
        ``s2`` with ``difflib.SequenceMatcher``. A deleted chunk whose text
        equals a not-yet-paired inserted chunk is reported as a displacement.
        Messages are ordered: displacements, remaining deletions, remaining
        insertions, replacements.

        Displacement positions are printed 1-based; all other positions are
        the raw opcode start index.

        :param s1: original word (normalised here before matching).
        :param s2: input word (already normalised, per the original comment).
        :return: the messages joined with newlines; empty string if none.
        """
        # still using normalisation on the original text. With enough modification of the original text,
        # this won't be necessary
        matcher = difflib.SequenceMatcher(None, textprocess.normalize(s1), s2)

        # NOTE(review): opcode indices refer to the normalised s1 but are used
        # to slice the raw s1 -- correct only if normalisation keeps every
        # character position; confirm.
        removed, added, swapped = [], [], []
        for op, a_lo, a_hi, b_lo, b_hi in matcher.get_opcodes():
            if op == 'replace':
                swapped.append((s1[a_lo:a_hi], s2[b_lo:b_hi], a_lo))
            elif op == 'delete':
                removed.append((s1[a_lo:a_hi], a_lo))
            elif op == 'insert':
                added.append((s2[b_lo:b_hi], a_lo))

        messages = []
        paired_removed, paired_added = set(), set()
        for r_idx, (text, src) in enumerate(removed):
            for a_idx, (candidate, dst) in enumerate(added):
                if a_idx in paired_added or candidate != text:
                    continue
                # First free insertion with identical text: treat as a move.
                paired_removed.add(r_idx)
                paired_added.add(a_idx)
                messages.append('Displaced {} from {} to {}'.format(
                    text, src + 1, dst + 1))
                break

        messages.extend(
            'Deleted {} at {}'.format(text, pos)
            for idx, (text, pos) in enumerate(removed)
            if idx not in paired_removed)
        messages.extend(
            'Inserted {} at {}'.format(text, pos)
            for idx, (text, pos) in enumerate(added)
            if idx not in paired_added)
        messages.extend(
            'Replaced {} with {} at {}'.format(old, new, pos)
            for old, new, pos in swapped)

        return '\n'.join(messages)
Ejemplo n.º 6
0
    def _create_tooltip(self, s1, s2):
        """Return a newline-separated list of the edits turning ``s1`` into ``s2``.

        The normalised form of ``s1`` (``textprocess.normalize``) is aligned
        with ``s2`` through ``difflib.SequenceMatcher``. Each deletion is
        paired with the first unpaired insertion carrying the same text and
        reported as a displacement; everything left over is reported as a
        deletion, insertion or replacement, in that order.

        Displacement positions are shown 1-based, the remaining positions are
        the opcode start index as-is.

        :param s1: original word; normalised inside this method.
        :param s2: input word; already normalised according to the original comment.
        :return: one message per line, or an empty string when nothing differs.
        """
        # still using normalisation on the original text. With enough modification of the original text,
        # this won't be necessary
        opcodes = difflib.SequenceMatcher(None, textprocess.normalize(s1), s2).get_opcodes()

        # NOTE(review): the indices come from the normalised s1 yet slice the
        # raw s1; that presumes normalisation never shifts positions -- verify.
        deletions = [(s1[i1:i2], i1) for tag, i1, i2, j1, j2 in opcodes if tag == 'delete']
        insertions = [(s2[j1:j2], i1) for tag, i1, i2, j1, j2 in opcodes if tag == 'insert']
        replacements = [(s1[i1:i2], s2[j1:j2], i1) for tag, i1, i2, j1, j2 in opcodes if tag == 'replace']

        deletion_moved = [False] * len(deletions)
        insertion_moved = [False] * len(insertions)

        lines = []
        for d_idx, (gone, origin) in enumerate(deletions):
            for i_idx, (came, target) in enumerate(insertions):
                if gone == came and not insertion_moved[i_idx]:
                    deletion_moved[d_idx] = True
                    insertion_moved[i_idx] = True
                    lines.append('Displaced {} from {} to {}'.format(gone, origin + 1, target + 1))
                    break

        for moved, (gone, origin) in zip(deletion_moved, deletions):
            if not moved:
                lines.append('Deleted {} at {}'.format(gone, origin))
        for moved, (came, target) in zip(insertion_moved, insertions):
            if not moved:
                lines.append('Inserted {} at {}'.format(came, target))
        for before, after, where in replacements:
            lines.append('Replaced {} with {} at {}'.format(before, after, where))

        return '\n'.join(lines)
Ejemplo n.º 7
0
 def __init__(self, original_lines, input_lines):
     """Keep both line sequences plus a normalized copy of the input lines.

     :param original_lines: lines of the original text.
     :param input_lines: lines of the input; must be as many as
         ``original_lines`` (checked with ``assert``).
     """
     same_length = len(original_lines) == len(input_lines)
     assert same_length
     self.original_lines, self.input_lines = original_lines, input_lines
     # Normalized through the project helper ``textprocess.normalize``.
     self.input_lines_normalized = textprocess.normalize(input_lines)
Ejemplo n.º 8
0
 def __init__(self, original_lines, input_lines):
     """Store the original and input lines and a normalized version of the input.

     :param original_lines: lines of the original text.
     :param input_lines: input lines; an ``assert`` requires their count to
         equal that of ``original_lines``.
     """
     expected_count = len(original_lines)
     assert expected_count == len(input_lines)
     self.original_lines = original_lines
     self.input_lines = input_lines
     # The normalized copy is produced by ``textprocess.normalize``.
     normalized_input = textprocess.normalize(input_lines)
     self.input_lines_normalized = normalized_input