Esempio n. 1
0
            # turn the string into a list
            warpa_ph = arpa_phon_str.split()

        except Exception, e:
            warpa_ph = ["NO MATCH"]

        return warpa_ph


# ----------------------------------------------------------------------------

    context = {}

    # ---- Long Text Box --------------------------------------------
    form1 = EnglishInputForm()
    context['form_1'] = form1

    # if request.method == 'GET':
    #     input_string = request.GET.get('input', '')

    if request.method == 'POST':
        form1 = EnglishInputForm(request.POST)
        context['form_1'] = form1

        if form1.is_valid():
            # read data from form
            input_string = form1.cleaned_data['english_input_f']
            context['english_input_v'] = input_string

            # Make a map that shows capitals, minuscules, apostrophes, and non-letters
            # char_to_remove = ['0', '1', '2']
            # arpa_phon_clean = arpa_phon_str.translate(None, ''.join(char_to_remove))

            # turn the string into a list
            warpa_ph = arpa_phon_str.split()

        except Exception, e:
            warpa_ph = ["NO MATCH"]

        return warpa_ph

# ----------------------------------------------------------------------------
    context = {}

    # ---- Long Text Box --------------------------------------------
    form1 = EnglishInputForm()
    context['form_1'] = form1

    if request.method == 'POST':
        form1 = EnglishInputForm(request.POST)
        context['form_1'] = form1

        if form1.is_valid():
            # read data from form
            input_string = form1.cleaned_data['english_input_f']
            context['english_input_v'] = input_string

            # Make a map that shows capitals, minuscules, apostrophes, and non-letters
            input_length = len(input_string)

# ----------------------------------------------------------------------------
Esempio n. 3
0
# Placeholder stored in the phoneme table for words that could not be
# looked up in the pronunciation dictionary.
_NO_MATCH = "NO MATCH"

# Character-map flags: "C" marks a capital letter, "m" a minuscule letter.
_LETTER_FLAGS = ("C", "m")

# Every value a word can take in the word map (flags of its first one or
# two letters).  "mC" (e.g. "iPhone") is included on purpose: leaving it
# out makes the word/phoneme pairing drift for the rest of the text.
_WORD_FLAGS = ("C", "m", "CC", "Cm", "mC", "mm")

# The correlation between the Arpabet and the Deseret script.
# Note: This is a provisional correlation for early proof-of-concept
# work; it has serious flaws in the correlations (e.g. 'ER' shares the
# glyph of 'R').
# The glyphs are written as \UXXXXXXXX escapes rather than explicit
# surrogate pairs so that each one is a single, encodable character on
# every Python build.
_ARPA_CORR_UPPER = {
    'IY': u'\U00010400',
    'EY': u'\U00010401',
    'AA': u'\U00010402',
    'AO': u'\U00010403',
    'OW': u'\U00010404',
    'UW': u'\U00010405',
    'IH': u'\U00010406',
    'EH': u'\U00010407',
    'AE': u'\U00010408',
    'AH': u'\U0001040A',
    'UH': u'\U0001040B',
    'AY': u'\U0001040C',
    'AW': u'\U0001040D',
    'W': u'\U0001040E',
    'Y': u'\U0001040F',
    'HH': u'\U00010410',
    'P': u'\U00010411',
    'B': u'\U00010412',
    'T': u'\U00010413',
    'D': u'\U00010414',
    'CH': u'\U00010415',
    'JH': u'\U00010416',
    'K': u'\U00010417',
    'G': u'\U00010418',
    'F': u'\U00010419',
    'V': u'\U0001041A',
    'TH': u'\U0001041B',
    'DH': u'\U0001041C',
    'S': u'\U0001041D',
    'Z': u'\U0001041E',
    'SH': u'\U0001041F',
    'ZH': u'\U00010420',
    'R': u'\U00010421',
    'L': u'\U00010422',
    'M': u'\U00010423',
    'N': u'\U00010424',
    'NG': u'\U00010425',
    'OY': u'\U00010426',
    'ER': u'\U00010421',
}
_ARPA_CORR_LOWER = {
    'IY': u'\U00010428',
    'EY': u'\U00010429',
    'AA': u'\U0001042A',
    'AO': u'\U0001042B',
    'OW': u'\U0001042C',
    'UW': u'\U0001042D',
    'IH': u'\U0001042E',
    'EH': u'\U0001042F',
    'AE': u'\U00010430',
    'AH': u'\U00010432',
    'UH': u'\U00010433',
    'AY': u'\U00010434',
    'AW': u'\U00010435',
    'W': u'\U00010436',
    'Y': u'\U00010437',
    'HH': u'\U00010438',
    'P': u'\U00010439',
    'B': u'\U0001043A',
    'T': u'\U0001043B',
    'D': u'\U0001043C',
    'CH': u'\U0001043D',
    'JH': u'\U0001043E',
    'K': u'\U0001043F',
    'G': u'\U00010440',
    'F': u'\U00010441',
    'V': u'\U00010442',
    'TH': u'\U00010443',
    'DH': u'\U00010444',
    'S': u'\U00010445',
    'Z': u'\U00010446',
    'SH': u'\U00010447',
    'ZH': u'\U00010448',
    'R': u'\U00010449',
    'L': u'\U0001044A',
    'M': u'\U0001044B',
    'N': u'\U0001044C',
    'NG': u'\U0001044D',
    'OY': u'\U0001044E',
    'ER': u'\U00010449',
}


def _classify_chars(text):
    """Return a per-character map of *text*.

    Each ASCII capital becomes "C", each ASCII minuscule becomes "m" and
    every other character is kept as itself.  An apostrophe with a letter
    on both sides counts as part of the word and takes the flag of the
    letter that follows it; any other apostrophe stays "'".
    """
    char_map = []
    for ch in text:
        if 'A' <= ch <= 'Z':
            char_map.append("C")
        elif 'a' <= ch <= 'z':
            char_map.append("m")
        elif ch == "'":
            # temporary marker, resolved in the second pass below
            char_map.append("a")
        else:
            # for now everything else is not part of a word
            # (hyphens will probably need handling at some point)
            char_map.append(ch)

    last = len(char_map) - 1
    for i in range(len(char_map)):
        if char_map[i] != "a":
            continue
        inside_word = (
            0 < i < last
            and char_map[i - 1] in _LETTER_FLAGS
            and char_map[i + 1] in _LETTER_FLAGS
        )
        char_map[i] = char_map[i + 1] if inside_word else "'"
    return char_map


def _build_word_map(char_map):
    """Collapse a character map into a word map plus word boundaries.

    Returns ``(word_map, bounds)``.  ``word_map`` holds one entry per
    non-word character (the character itself) and one entry per word (the
    flags of its first one or two letters, e.g. "CC", "Cm", "mm", "m").
    ``bounds`` is a flat list of alternating start / stop indexes, one
    pair per word, usable as slice limits on the original text.
    """
    word_map = []
    bounds = []
    letters_seen = 0  # letters consumed so far in the current word
    for i, flag in enumerate(char_map):
        if flag in _LETTER_FLAGS:
            letters_seen += 1
            if letters_seen == 1:
                word_map.append(flag)
                bounds.append(i)
            elif letters_seen == 2:
                word_map[-1] += flag
            # Third and later letters are ignored: at this stage only
            # all-caps, initial-cap and no-cap words are distinguished.
        else:
            if letters_seen:
                bounds.append(i)
            letters_seen = 0
            word_map.append(flag)
    if letters_seen:
        # the text ends in the middle of a word
        bounds.append(len(char_map))
    return word_map, bounds


def _lookup_phonemes(words):
    """Map each distinct word to its list of Arpabet phonemes.

    Words that cannot be looked up are mapped to the "NO MATCH"
    placeholder instead of raising.
    """
    phonemes = {}
    for word in words:
        if word in phonemes:
            # already resolved; avoid a second identical query
            continue
        # Since the word might not be in the dictionary, we try to fail
        # softly: any lookup problem yields the placeholder.
        try:
            entry = DictCMU.objects.get(entry__iexact=word)
            # Remove the accent (stress) digits.  A plain filter is used
            # because str.translate(None, ...) only exists on Python 2.
            stripped = ''.join(
                ch for ch in entry.phonemes if ch not in "012")
            phonemes[word] = stripped.split()
        except Exception:
            phonemes[word] = _NO_MATCH
    return phonemes


def _transliterate(word_map, words, phonemes):
    """Return the output pieces for *word_map*, matching letter case.

    Non-word entries are copied through unchanged.  A word without usable
    phonemes is rendered as one string of dots as long as the word; every
    other word contributes one list item per phoneme.
    """
    pieces = []
    remaining = iter(words)
    for entry in word_map:
        if entry not in _WORD_FLAGS:
            # not a word: insert the symbol directly
            pieces.append(entry)
            continue

        word = next(remaining)
        arpa = phonemes[word]
        if arpa == _NO_MATCH or not arpa:
            # word not in dictionary (or stored without phonemes)
            pieces.append("." * len(word))
        elif entry in ("C", "CC"):
            # all-caps word
            pieces.extend(_ARPA_CORR_UPPER[phone] for phone in arpa)
        elif entry == "Cm":
            # initial-cap word
            pieces.append(_ARPA_CORR_UPPER[arpa[0]])
            pieces.extend(_ARPA_CORR_LOWER[phone] for phone in arpa[1:])
        else:
            # no-caps word; "mC" words are rendered in lower case too
            pieces.extend(_ARPA_CORR_LOWER[phone] for phone in arpa)
    return pieces


def input_output(request):
    """Transliterate submitted English text into Deseret characters.

    On a valid POST the text of the ``english_input_f`` form field is
    split into words and non-word characters, each word is looked up in
    ``DictCMU`` and its Arpabet phonemes are replaced by Deseret glyphs.

    Keys written to the local ``context`` dict:
        form_1           -- the (bound or unbound) EnglishInputForm
        english_input_v  -- the submitted text
        unic_out         -- list of output pieces
        unic_str         -- the pieces joined into one string

    NOTE(review): no response is built or returned in the visible code;
    the render/return step is presumably outside this excerpt -- confirm.
    """
    context = {}

    # ---- Long Text Box --------------------------------------------
    form1 = EnglishInputForm()
    context['form_1'] = form1

    if request.method == 'POST':
        form1 = EnglishInputForm(request.POST)
        context['form_1'] = form1

        if form1.is_valid():
            # read data from form
            input_string = form1.cleaned_data['english_input_f']
            context['english_input_v'] = input_string

            # Map capitals, minuscules, apostrophes and non-letters
            input_map1 = _classify_chars(input_string)
            print("input_map1 (joined): %s" % ''.join(input_map1))

            # Categorize words as all-caps (CC), initial-cap (Cm) and
            # no-caps (mm), and record where each word starts and stops
            input_map2, word_index = _build_word_map(input_map1)
            print("input_map2 (joined): %s" % ''.join(input_map2))
            print("word_index: %s" % word_index)

            # generate list of words to be transliterated
            word_list = [
                input_string[start:stop]
                for start, stop in zip(word_index[0::2], word_index[1::2])
            ]
            print("word_list: %s" % word_list)

            # Get Arpabet correlations for each word in word_list
            arpa_phonemes = _lookup_phonemes(word_list)
            print("arpa_phonemes: %s" % arpa_phonemes)

            # Deseret characters for the phonemes, matching letter case
            unic_out = _transliterate(input_map2, word_list, arpa_phonemes)
            context['unic_out'] = unic_out

            unic_str = ''.join(unic_out)
            print("unic_str: %s" % unic_str)
            context['unic_str'] = unic_str
def input_output(request):
    """Transliterate submitted English text into Deseret characters.

    On a valid POST the text of the ``english_input_f`` form field is
    mapped character by character, cut into words, looked up word by word
    in ``DictCMU`` and rendered with the Deseret glyph of each Arpabet
    phoneme.  The form, the submitted text, the list of output pieces and
    the joined output string are stored in the local ``context`` dict
    under ``form_1``, ``english_input_v``, ``unic_out`` and ``unic_str``.

    NOTE(review): no response is built or returned in the visible code;
    the render/return step is presumably outside this excerpt -- confirm.
    """
    context = {}

    # ---- Long Text Box --------------------------------------------
    form1 = EnglishInputForm()
    context['form_1'] = form1

    if request.method == 'POST':
        form1 = EnglishInputForm(request.POST)
        context['form_1'] = form1

        if form1.is_valid():
            # read data from form
            input_string = form1.cleaned_data['english_input_f']
            context['english_input_v'] = input_string

            # flags for a capital letter and a minuscule letter
            letter_flags = ("C", "m")

            # Make a map that shows capitals, minuscules, apostrophes
            # and non-letters
            input_map1 = []
            for ch in input_string:
                if 'A' <= ch <= 'Z':
                    input_map1.append("C")
                elif 'a' <= ch <= 'z':
                    input_map1.append("m")
                elif ch == "'":
                    # temporary marker, resolved in the next loop
                    input_map1.append("a")
                else:
                    # for now everything else is not part of a word
                    # (hyphens will probably need handling at some point)
                    input_map1.append(ch)

            # An apostrophe with letters on both sides is part of a word
            # and takes the flag of the following letter; every other
            # apostrophe (quote mark, leading or trailing) is not.
            last = len(input_map1) - 1
            for i in range(len(input_map1)):
                if input_map1[i] != "a":
                    continue
                if (0 < i < last
                        and input_map1[i - 1] in letter_flags
                        and input_map1[i + 1] in letter_flags):
                    input_map1[i] = input_map1[i + 1]
                else:
                    input_map1[i] = "'"
            print("input_map1 (joined): %s" % ''.join(input_map1))

            # Finalize the map, categorizing words as all-caps (CC),
            # initial-cap (Cm), and no-caps (mm), and record the start and
            # stop index of every word.
            input_map2 = []
            word_index = []
            letters_seen = 0  # letters consumed so far in the current word
            for i, flag in enumerate(input_map1):
                if flag in letter_flags:
                    letters_seen += 1
                    if letters_seen == 1:
                        input_map2.append(flag)
                        word_index.append(i)
                    elif letters_seen == 2:
                        input_map2[-1] += flag
                    # later letters are ignored [at this stage we're
                    # ignoring mixed upper and lower case except
                    # initial caps]
                else:
                    if letters_seen:
                        word_index.append(i)
                    letters_seen = 0
                    input_map2.append(flag)
            if letters_seen:
                # last char of the string is a letter: close the word
                word_index.append(len(input_map1))

            print("input_map2 (joined): %s" % ''.join(input_map2))
            print("word_index: %s" % word_index)

            # generate list of words to be transliterated
            word_list = []
            for pos in range(0, len(word_index), 2):
                word_list.append(
                    input_string[word_index[pos]:word_index[pos + 1]])
            print("word_list: %s" % word_list)

            # Get Arpabet correlations for each word in word_list
            arpa_phonemes = {}
            for word in word_list:
                if word in arpa_phonemes:
                    # already resolved; avoid a second identical query
                    continue
                # Since the word might not be in the dictionary, we
                # try to fail softly
                try:
                    arpa_output = DictCMU.objects.get(entry__iexact=word)
                    # Remove accent (stress) digits.  A plain filter is
                    # used because str.translate(None, ...) only exists
                    # on Python 2.
                    arpa_phon_clean = ''.join(
                        ch for ch in arpa_output.phonemes
                        if ch not in "012")
                    # turn the string into a list
                    arpa_phonemes[word] = arpa_phon_clean.split()
                except Exception:
                    arpa_phonemes[word] = "NO MATCH"

            print("arpa_phonemes: %s" % arpa_phonemes)

            # The correlation between the Arpabet and the Deseret script.
            # Note: This is a provisional correlation for early
            # proof-of-concept work; it has serious flaws in the
            # correlations.
            # Glyphs use \UXXXXXXXX escapes instead of explicit surrogate
            # pairs so each one is a single, encodable character on every
            # Python build.
            arpa_corr_U = {
                'IY': u'\U00010400', 'EY': u'\U00010401',
                'AA': u'\U00010402', 'AO': u'\U00010403',
                'OW': u'\U00010404', 'UW': u'\U00010405',
                'IH': u'\U00010406', 'EH': u'\U00010407',
                'AE': u'\U00010408', 'AH': u'\U0001040A',
                'UH': u'\U0001040B', 'AY': u'\U0001040C',
                'AW': u'\U0001040D', 'W': u'\U0001040E',
                'Y': u'\U0001040F', 'HH': u'\U00010410',
                'P': u'\U00010411', 'B': u'\U00010412',
                'T': u'\U00010413', 'D': u'\U00010414',
                'CH': u'\U00010415', 'JH': u'\U00010416',
                'K': u'\U00010417', 'G': u'\U00010418',
                'F': u'\U00010419', 'V': u'\U0001041A',
                'TH': u'\U0001041B', 'DH': u'\U0001041C',
                'S': u'\U0001041D', 'Z': u'\U0001041E',
                'SH': u'\U0001041F', 'ZH': u'\U00010420',
                'R': u'\U00010421', 'L': u'\U00010422',
                'M': u'\U00010423', 'N': u'\U00010424',
                'NG': u'\U00010425', 'OY': u'\U00010426',
                'ER': u'\U00010421',
            }
            arpa_corr_m = {
                'IY': u'\U00010428', 'EY': u'\U00010429',
                'AA': u'\U0001042A', 'AO': u'\U0001042B',
                'OW': u'\U0001042C', 'UW': u'\U0001042D',
                'IH': u'\U0001042E', 'EH': u'\U0001042F',
                'AE': u'\U00010430', 'AH': u'\U00010432',
                'UH': u'\U00010433', 'AY': u'\U00010434',
                'AW': u'\U00010435', 'W': u'\U00010436',
                'Y': u'\U00010437', 'HH': u'\U00010438',
                'P': u'\U00010439', 'B': u'\U0001043A',
                'T': u'\U0001043B', 'D': u'\U0001043C',
                'CH': u'\U0001043D', 'JH': u'\U0001043E',
                'K': u'\U0001043F', 'G': u'\U00010440',
                'F': u'\U00010441', 'V': u'\U00010442',
                'TH': u'\U00010443', 'DH': u'\U00010444',
                'S': u'\U00010445', 'Z': u'\U00010446',
                'SH': u'\U00010447', 'ZH': u'\U00010448',
                'R': u'\U00010449', 'L': u'\U0001044A',
                'M': u'\U0001044B', 'N': u'\U0001044C',
                'NG': u'\U0001044D', 'OY': u'\U0001044E',
                'ER': u'\U00010449',
            }

            # get unicode codes for Deseret characters that correlate to
            # the Arpabet in arpa_phonemes, taking care to match capital
            # and lower-case letters
            unic_out = []
            w_num = 0
            for inp in input_map2:
                # "mC" (e.g. "iPhone") must count as a word as well;
                # otherwise w_num falls behind and every later word is
                # rendered with the wrong phonemes.
                if inp not in ("CC", "C", "Cm", "mC", "mm", "m"):
                    # not a word: insert the symbol directly
                    unic_out.append(inp)
                    continue

                word = word_list[w_num]
                w_num += 1
                arpa_ph = arpa_phonemes[word]

                if arpa_ph == "NO MATCH" or not arpa_ph:
                    # word not in dictionary (or stored without phonemes)
                    unic_out.append("." * len(word))
                elif inp in ("CC", "C"):
                    # All-Caps words
                    unic_out.extend(arpa_corr_U[phone] for phone in arpa_ph)
                elif inp == "Cm":
                    # Initial-Caps words
                    unic_out.append(arpa_corr_U[arpa_ph[0]])
                    unic_out.extend(
                        arpa_corr_m[phone] for phone in arpa_ph[1:])
                else:
                    # No-Caps words ("mC" is rendered in lower case too)
                    unic_out.extend(arpa_corr_m[phone] for phone in arpa_ph)

            context['unic_out'] = unic_out

            unic_str = ''.join(unic_out)
            print("unic_str: %s" % unic_str)
            context['unic_str'] = unic_str