Example #1
0
    def convert_latin_chars(self, strings):
        """
        Replace Latin look-alike characters with Greek ones in mixed text.

        ``strings`` may be one string or a list of strings. A string is only
        rewritten when a run of Latin letters, digits or ``?`` sits directly
        next to a Greek character; in that case the whole string is handed
        to multiple_replace() together with the look-alike table below.
        Strings without such a mix are passed through unchanged.

        Returns the single result when exactly one string was processed,
        otherwise the list of results. If anything raises, the traceback is
        printed and ``False`` is returned.
        """
        debug = False
        batch = strings if isinstance(strings, list) else [strings]
        if debug:
            print(batch)
        try:
            # Latin character -> Greek character it is commonly mistyped for.
            # The trailing "y" markers are the original author's annotations.
            # Built once per call; assumes multiple_replace() does not mutate
            # the mapping it is given -- TODO confirm.
            lookalikes = {
                'a': 'α',  # y
                'A': 'Α',  # y
                'd': 'δ',  # y
                'e': 'ε',  # y
                'E': 'Ε',
                'Z': 'Ζ',
                'H': 'Η',
                'i': 'ι',  # y
                'I': 'Ι',
                'k': 'κ',
                'K': 'Κ',  # y
                'v': 'ν',  # y
                'N': 'Ν',
                'o': 'ο',  # y
                'O': 'Ο',  # y
                'p': 'ρ',  # y
                'P': 'Ρ',  # y
                't': 'τ',  # y
                'T': 'Τ',  # y
                'Y': 'Υ',
                'x': 'χ',
                'X': 'Χ',  # y
                'w': 'ω',  # y
                '?': ';',
            }
            # Group "a" is an optional leading Greek character. When it is
            # present, any Latin run after it counts; when it is absent, the
            # Latin run must itself be followed by a Greek character.
            mixed = re.compile(
                r'(?P<a>[Α-Ωα-ω\u1F00-\u1FFF])?([a-z]|[A-Z]|\d|\?)+(?(a).*|[Α-Ωα-ω\u1F00-\u1FFF])')

            converted = []
            for text in batch:
                if debug:
                    print('trying', text)
                hit = mixed.search(text)
                if debug:
                    print('match result:', hit)
                if hit:
                    if debug:
                        print('Latin character found in Greek string: ')
                        print(hit.group(), 'in', text)
                        print(bytes(hit.group(), 'utf8'), 'in', bytes(text, 'utf8'))
                    result = multiple_replace(text, lookalikes)
                    if debug:
                        print('replaced with Greek characters:')
                        print(result)
                        print(bytes(result, 'utf8'))
                else:
                    result = text
                converted.append(result)
            # A lone result is unwrapped, even if the caller passed a list.
            return converted[0] if len(converted) == 1 else converted
        except Exception:
            print(traceback.format_exc(12))
            return False
def test_multiple_replace(string_in, equivs, string_out):
    """
    Unit test for multiple_replace() utility function.
    """
    actual = plugin_utils.multiple_replace(string_in, equivs)
    print 'string in', string_in
    print 'actual', actual
    print 'expected', string_out
    print 'equivs', equivs
    assert actual == string_out
Example #3
0
def test_multiple_replace(string_in, equivs, string_out):
    """
    Unit test for multiple_replace() utility function.
    """
    actual = plugin_utils.multiple_replace(string_in, equivs)
    print 'string in', string_in
    print 'actual', actual
    print 'expected', string_out
    print 'equivs', equivs
    assert actual == string_out
Example #4
0
    def convert_latin_chars(self, strings):
        """
        Check for latin similar characters mixed with Greek and convert them to
        Greek.

        """
        debug = False
        strings = [strings] if not isinstance(strings, list) else strings
        try:
            newstrings = []
            rgx = ur'(?P<a>[Α-Ωα-ω])?([a-z]|[A-Z]|\d|\?)(?(a).*|[Α-Ωα-ω])'
            latin = re.compile(rgx, re.U)
            for string in strings:
                string = to_unicode(string)
                mymatch = re.search(latin, string)
                if not mymatch:
                    newstring = string
                else:
                    subs = {u'a': u'α',  # y
                            u'A': u'Α',  # n
                            u'd': u'δ',  # y
                            u'e': u'ε',  # y
                            u'E': u'Ε',
                            u'Z': u'Ζ',
                            u'H': u'Η',
                            u'i': u'ι',  # y
                            u'I': u'Ι',
                            u'k': u'κ',
                            u'K': u'Κ',  # y
                            u'v': u'ν',  # y
                            u'N': u'Ν',
                            u'o': u'ο',  # y
                            u'O': u'Ο',  # ΝΝΝ
                            u'p': u'ρ',  # y
                            u'P': u'Ρ',  # y
                            u't': u'τ',  # y
                            u'T': u'Τ',  # y
                            u'Y': u'Υ',
                            u'x': u'χ',
                            u'X': u'Χ',  # y
                            u'w': u'ω',  # y
                            u'?': u';'}
                    if debug: print 'Latin character found in Greek string: '
                    if debug: print mymatch.group(), 'in', to_bytes(string)
                    newstring = multiple_replace(string, subs)
                    if debug: print 'replaced with Greek characters:'
                    if debug: print to_bytes(newstring)
                newstrings.append(newstring)
            if len(newstrings) == 1:
                newstrings = newstrings[0]
            return newstrings
        except Exception:
            print traceback.format_exc(12)
            return False
Example #5
0
    def normalize_accents(self, strings):
        """
        Return a polytonic Greek unicode string with accents removed.

        The one argument should be a list of strings to be normalized. It can
        also handle a single string.

        """
        debug = False
        instrings = [strings] if not isinstance(strings, list) else strings

        outstrings = []
        for string in instrings:
            substrs = to_unicode(string).split(' ')

            equivs = {u'α': [u'ά', u'ὰ', u'ᾶ'],
                      u'Α': [u'Ά', u'Ὰ'],  # caps
                      u'ἀ': [u'ἄ', u'ἂ', u'ἆ'],
                      u'Ἀ': [u'Ἄ', u'Ἂ', u'Ἆ', u'᾿Α'],  # caps (including combining )
                      u'ἁ': [u'ἅ', u'ἃ', u'ἇ'],
                      u'Ἁ': [u'Ἅ', u'Ἃ', u'Ἇ', u'῾Α'],  # caps (including combining)
                      u'ᾳ': [u'ᾷ', u'ᾲ', u'ᾴ'],
                      u'ᾀ': [u'ᾄ', u'ᾂ', u'ᾆ'],
                      u'ᾁ': [u'ᾅ', u'ᾃ', u'ᾇ'],
                      u'ε': [u'έ', u'ὲ'],
                      u'Ε': [u'Έ', u'Ὲ'],  # caps
                      u'ἐ': [u'ἔ', u'ἒ'],
                      u'Ἐ': [u'Ἔ', u'Ἒ', u'᾿Ε'],  # caps (including combining)
                      u'ἑ': [u'ἕ', u'ἓ'],
                      u'Ἑ': [u'Ἕ', u'Ἓ', u'῾Ε'],  # caps (including combining)
                      u'η': [u'ῆ', u'ή', u'ὴ'],
                      u'Η': [u'Ή', u'Ὴ'],  # caps
                      u'ἠ': [u'ἤ', u'ἢ', u'ἦ'],
                      u'Ἠ': [u'Ἤ', u'Ἢ', u'Ἦ', u'᾿Η'],  # caps (including combining)
                      u'ἡ': [u'ἥ', u'ἣ', u'ἧ'],
                      u'Ἡ': [u'Ἥ', u'Ἣ', u'Ἧ', u'῾Η'],  # caps (including combining)
                      u'ῃ': [u'ῇ', u'ῄ', u'ῂ'],
                      u'ᾐ': [u'ᾔ', u'ᾒ', u'ᾖ'],
                      u'ᾑ': [u'ᾕ', u'ᾓ', u'ᾗ'],
                      u'ι': [u'ῖ', u'ϊ', u'ί', u'ὶ', u'ί'],
                      u'ἰ': [u'ἴ', u'ἲ', u'ἶ'],
                      u'ἱ': [u'ἵ', u'ἳ', u'ἷ'],
                      u'Ι': [u'Ϊ', u'Ί', u'Ὶ', u'Ί'],  # caps
                      u'Ἰ': [u'Ἴ', u'Ἲ', u'Ἶ', u'᾿Ι'],  # caps (including combining)
                      u'Ἱ': [u'Ἵ', u'Ἳ', u'Ἷ', u'῾Ι'],  # caps (including combining)
                      u'ο': [u'ό', u'ὸ'],
                      u'ὀ': [u'ὄ', u'ὂ'],
                      u'ὁ': [u'ὅ', u'ὃ'],
                      u'Ο': [u'Ό', u'Ὸ'],  # caps
                      u'Ὀ': [u'Ὄ', u'Ὂ', u'᾿Ο'],  # caps (including combining)
                      u'Ὁ': [u'Ὅ', u'Ὃ', u'῾Ο'],  # caps (including combining)
                      u'υ': [u'ῦ', u'ϋ', u'ύ', u'ὺ'],
                      u'ὐ': [u'ὔ', u'ὒ', u'ὖ'],
                      u'ὑ': [u'ὕ', u'ὓ', u'ὗ'],
                      u'Υ': [u'Ϋ', u'Ύ', u'Ὺ'],  # caps TODO: no capital U with smooth?
                      u'Ὑ': [u'Ὕ', u'Ὓ', u'Ὗ', u'῾Υ'],  # caps (including combining)
                      u'ω': [u'ῶ', u'ώ', u'ὼ'],
                      u'ὠ': [u'ὤ', u'ὢ', u'ὦ'],
                      u'ὡ': [u'ὥ', u'ὣ', u'ὧ'],
                      u'Ω': [u'Ώ', u'Ὼ'],  # caps
                      u'Ὠ': [u'Ὤ', u'Ὢ', u'Ὦ', u'᾿Ω'],  # caps (including combining)
                      u'Ὡ': [u'Ὥ', u'Ὣ', u'Ὧ', u'῾Ω'],  # caps (including combining)
                      u'ῳ': [u'ῷ', u'ῴ', u'ῲ'],
                      u'ᾠ': [u'ᾤ', u'ᾢ', u'ᾦ'],
                      u'ᾡ': [u'ᾥ', u'ᾣ', u'ᾧ'],
                      u'Ῥ': [u'῾Ρ'],  # also handle improperly formed marks (rough)
                      u'"': [u'“', u'”', u'«', u'»'],  # handle curly quotes
                      u"'": [u'‘', u'’'],
                      }
            accented = chain(*equivs.values())
            restr = '|'.join(accented)
            newstrings = []
            # FIXME: this is ugly and conflicts with question mark conversion
            exempt = [u'τίνος', u'τί', u'τίς', u'τίνα', u'τίνας', u'τίνι',
                      u'Τίνος', u'Τί', u'Τίς', u'Τίνα', u'Τίνας', u'Τίνι']
            ex_period = [x + u'.' for x in exempt]
            ex_scolon = [x + u';' for x in exempt]
            ex_anotel = [x + u'·' for x in exempt]
            ex_comma = [x + u',' for x in exempt]
            ex_qmark = [x + u'\?' for x in exempt]
            ex_colon = [x + u':' for x in exempt]
            exempt = list(chain(exempt, ex_colon, ex_comma, ex_qmark, ex_scolon,
                          ex_period, ex_anotel))

            for mystring in substrs:
                latin_chars = re.compile(r'^[a-zA-Z\s\.,:;\'\"\?]+$', re.U)
                islatin = re.match(latin_chars, mystring)
                if debug: print 'substring:', to_bytes(mystring)
                if debug: print 'islatin:', islatin
                if not islatin:
                    mystring = mystring.strip()
                    if debug: print '1:', to_bytes(mystring)
                    mystring = mystring.replace(u'ί', u'ί')  # avoid q-i iota on windows
                    if debug: print '2:', to_bytes(mystring)

                    if mystring not in exempt:
                        # below print statement causes UnicodeEncodeError on live server
                        # print mystring, 'not exempt', type(mystring)
                        matching_letters = re.findall(to_unicode(restr), mystring,
                                                    re.I | re.U)
                        if debug: print 'matching letters:', to_bytes(matching_letters)
                        if matching_letters:

                            edict = {k: v for k, v in equivs.iteritems()
                                    if [m for m in v if m in matching_letters]}
                            key_vals = {ltr: k
                                        for ltr in list(chain(*edict.values()))
                                        for k in edict.keys()
                                        if ltr in edict[k]}
                            if debug: print key_vals
                            mystring = multiple_replace(mystring, key_vals)
                        else:
                            if debug: print 'no matching letters'
                    else:
                        if debug: print to_bytes(mystring), 'exempt'
                else:
                    if debug: print 'no Greek'
                newstrings.append(mystring)
            if debug: print '3'
            newstring = ' '.join(newstrings)
            if debug: print '4'
            outstrings.append(newstring)
            if debug: print '5'
        if len(outstrings) == 1:
            outstrings = outstrings[0]
        if debug: print 'returning', to_bytes(outstrings)
        return outstrings
Example #6
0
    def normalize_accents(self, strings):
        """
        Return polytonic Greek text with accent marks removed.

        ``strings`` may be a single string or a list of strings. Each string
        is split on single spaces. A word made up only of Latin letters,
        whitespace and basic punctuation is kept as is. Every other word is
        stripped of surrounding whitespace and has each accented letter
        listed in ``equivs`` replaced by its unaccented key via
        multiple_replace(); breathing marks and iota subscripts stay because
        they are part of the replacement letters. Curly and angle quotes
        become straight quotes. The forms listed in ``exempt`` (bare, or
        followed by one of ``: , ? ; . ·``) keep their accent.

        Returns a list of normalized strings, or the bare string when
        exactly one string was processed.
        """
        # TODO: use normalization library as described here:
        # https://stackoverflow.com/questions/23346506/javascript-normalize-accented-greek-characters
        debug = False
        batch = strings if isinstance(strings, list) else [strings]

        # unaccented letter -> accented variants that collapse onto it
        equivs = {'α': ['ά', 'ὰ', 'ᾶ'],
                  'Α': ['Ά', 'Ὰ'],  # caps
                  'ἀ': ['ἄ', 'ἂ', 'ἆ'],
                  'Ἀ': ['Ἄ', 'Ἂ', 'Ἆ', '᾿Α'],  # caps (including combining )
                  'ἁ': ['ἅ', 'ἃ', 'ἇ'],
                  'Ἁ': ['Ἅ', 'Ἃ', 'Ἇ', '῾Α'],  # caps (including combining)
                  'ᾳ': ['ᾷ', 'ᾲ', 'ᾴ'],
                  'ᾀ': ['ᾄ', 'ᾂ', 'ᾆ'],
                  'ᾁ': ['ᾅ', 'ᾃ', 'ᾇ'],
                  'ε': ['έ', 'ὲ'],
                  'Ε': ['Έ', 'Ὲ'],  # caps
                  'ἐ': ['ἔ', 'ἒ'],
                  'Ἐ': ['Ἔ', 'Ἒ', '᾿Ε'],  # caps (including combining)
                  'ἑ': ['ἕ', 'ἓ'],
                  'Ἑ': ['Ἕ', 'Ἓ', '῾Ε'],  # caps (including combining)
                  'η': ['ῆ', 'ή', 'ὴ'],
                  'Η': ['Ή', 'Ὴ'],  # caps
                  'ἠ': ['ἤ', 'ἢ', 'ἦ'],
                  'Ἠ': ['Ἤ', 'Ἢ', 'Ἦ', '᾿Η'],  # caps (including combining)
                  'ἡ': ['ἥ', 'ἣ', 'ἧ'],
                  'Ἡ': ['Ἥ', 'Ἣ', 'Ἧ', '῾Η'],  # caps (including combining)
                  'ῃ': ['ῇ', 'ῄ', 'ῂ'],
                  'ᾐ': ['ᾔ', 'ᾒ', 'ᾖ'],
                  'ᾑ': ['ᾕ', 'ᾓ', 'ᾗ'],
                  'ι': ['ῖ', 'ϊ', 'ί', 'ὶ', 'ί'],
                  'ἰ': ['ἴ', 'ἲ', 'ἶ'],
                  'ἱ': ['ἵ', 'ἳ', 'ἷ'],
                  'Ι': ['Ϊ', 'Ί', 'Ὶ', 'Ί'],  # caps
                  'Ἰ': ['Ἴ', 'Ἲ', 'Ἶ', '᾿Ι'],  # caps (including combining)
                  'Ἱ': ['Ἵ', 'Ἳ', 'Ἷ', '῾Ι'],  # caps (including combining)
                  'ο': ['ό', 'ὸ'],
                  'ὀ': ['ὄ', 'ὂ'],
                  'ὁ': ['ὅ', 'ὃ'],
                  'Ο': ['Ό', 'Ὸ'],  # caps
                  'Ὀ': ['Ὄ', 'Ὂ', '᾿Ο'],  # caps (including combining)
                  'Ὁ': ['Ὅ', 'Ὃ', '῾Ο'],  # caps (including combining)
                  'υ': ['ῦ', 'ϋ', 'ύ', 'ὺ'],
                  'ὐ': ['ὔ', 'ὒ', 'ὖ'],
                  'ὑ': ['ὕ', 'ὓ', 'ὗ'],
                  'Υ': ['Ϋ', 'Ύ', 'Ὺ'],  # caps TODO: no capital U with smooth?
                  'Ὑ': ['Ὕ', 'Ὓ', 'Ὗ', '῾Υ'],  # caps (including combining)
                  'ω': ['ῶ', 'ώ', 'ὼ'],
                  'ὠ': ['ὤ', 'ὢ', 'ὦ'],
                  'ὡ': ['ὥ', 'ὣ', 'ὧ'],
                  'Ω': ['Ώ', 'Ὼ'],  # caps (including combining)
                  'Ὠ': ['Ὤ', 'Ὢ', 'Ὦ', '᾿Ω'],  # caps (including combining)
                  'Ὡ': ['Ὥ', 'Ὣ', 'Ὧ', '῾Ω'],  # caps (including combining)
                  'ῳ': ['ῷ', 'ῴ', 'ῲ'],
                  'ᾠ': ['ᾤ', 'ᾢ', 'ᾦ'],
                  'ᾡ': ['ᾥ', 'ᾣ', 'ᾧ'],
                  'Ῥ': ['῾Ρ'],  # also handle improperly formed marks (rough)
                  '"': ['“', '”', '«', '»'],  # handle curly quotes
                  "'": ['‘', '’'],
                  }
        # One alternation over every accented variant in the table.
        restr = '|'.join(chain.from_iterable(equivs.values()))
        # A word consisting solely of these characters is treated as Latin.
        latin_chars = re.compile(r'^[a-zA-Z\s\.,:;\'\"\?]+$')

        # FIXME: this is ugly and conflicts with question mark conversion
        bare = ['τίνος', 'τί', 'τίς', 'τίνα', 'τίνας', 'τίνι',
                'Τίνος', 'Τί', 'Τίς', 'Τίνα', 'Τίνας', 'Τίνι']
        exempt = bare + [word + mark
                         for mark in (':', ',', '?', ';', '.', '·')
                         for word in bare]

        outstrings = []
        for text in batch:
            words = []
            for word in text.split(' '):
                islatin = latin_chars.match(word)
                if debug:
                    print('substring:', word)
                    print('islatin:', islatin)
                if islatin:
                    if debug:
                        print('no Greek')
                    words.append(word)
                    continue

                word = word.strip()
                if debug:
                    print('1:', word)
                word = word.replace('ί', 'ί')  # avoid q-i iota on windows
                if debug:
                    print('2:', word)

                if word in exempt:
                    if debug:
                        print(word, 'exempt')
                    words.append(word)
                    continue

                matching_letters = re.findall(restr, word, re.I)
                if debug:
                    print('matching letters:', matching_letters)
                if not matching_letters:
                    if debug:
                        print('no matching letters')
                    words.append(word)
                    continue

                # For every table row with at least one variant present in
                # the word, map all of that row's variants to the unaccented
                # key.
                key_vals = {ltr: plain
                            for plain, variants in equivs.items()
                            if any(v in matching_letters for v in variants)
                            for ltr in variants}
                if debug:
                    print(key_vals)
                word = multiple_replace(word, key_vals)
                if debug:
                    print('after replacing:' + word)
                words.append(word)

            if debug:
                print('3' + str(words))
            joined = ' '.join(words)
            if debug:
                print('4' + joined)
            outstrings.append(joined)
            if debug:
                print('5' + str(outstrings))

        # A lone result is unwrapped, even if the caller passed a list.
        if len(outstrings) == 1:
            outstrings = outstrings[0]
        if debug:
            print('returning', outstrings)
        return outstrings