Esempio n. 1
0
 def divideIntoSections(self):
     '''
     Convert every syllable's Mandarin text to pinyin and group the
     syllables into sentences.

     Per the original author's note, this mirrors
     lyricsParser.divideIntoSections; only the name of the syllable
     attribute (self.listSyllables) differs.

     For each syllable in self.listSyllables:
       * stripPunctuationSings() strips the text and reports whether the
         syllable ends a sentence;
       * unless the text is the marker 'REST', it is replaced in place by
         the first tone-less Pinyin reading of the text;
       * the syllable is added to the current sentence, and the sentence is
         appended to self.listSentences when it ends.

     Returns nothing; syl.text and self.listSentences are modified in place.
     '''
     # Build the lookup once; it does not depend on the syllable, so there
     # is no reason to re-create it on every iteration.
     cjk = CharacterLookup('C')

     currSectionLyrics = []
     for syl in self.listSyllables:
         isEndOfSentence, syl.text = stripPunctuationSings(syl.text)

         ### convert from mandarin to pinyin
         if syl.text != 'REST':
             textPinYinList = cjk.getReadingForCharacter(syl.text, 'Pinyin', toneMarkType='none')
             if len(textPinYinList) > 1:
                 self.logger.warn("converted syllable {} has {} parts".format(textPinYinList, len(textPinYinList)))
             if textPinYinList:
                 syl.text = textPinYinList[0] # take only first variant of pinyin interpretations
             else:
                 # No reading available (e.g. non-Chinese text): keep the
                 # text as it is instead of failing on an empty result.
                 # {!r} keeps the message ASCII-safe for unicode text.
                 self.logger.warn("no pinyin reading found for syllable {!r}, keeping original text".format(syl.text))

         ### finish up sentence when punctuation present
         currSectionLyrics.append(syl)
         if isEndOfSentence:
             self.listSentences.append(currSectionLyrics)
             currSectionLyrics = []
     # NOTE(review): syllables that follow the last sentence-ending
     # punctuation are never appended to self.listSentences (unchanged from
     # the original behaviour) - confirm this is intended.
Esempio n. 2
0
def mandarinToPinyin(mandarinChar):
    """Return the first tone-less Pinyin reading of ``mandarinChar``.

    The character is looked up with ``CharacterLookup('C')``. When the
    lookup yields more than one reading, a notice listing all of them is
    printed and the first one is still used.

    NOTE(review): the result is indexed unconditionally, so a lookup that
    yields no readings makes this function fail - confirm callers only pass
    characters that have a reading.
    """
    lookup = CharacterLookup('C')
    readings = lookup.getReadingForCharacter(
        mandarinChar, 'Pinyin', toneMarkType='none')

    variantCount = len(readings)
    if variantCount > 1:
        notice = "converted syllable {} has {} parts".format(
            readings, variantCount)
        print(notice)

    # Only the first of the possible pinyin interpretations is used.
    return readings[0]
Esempio n. 3
0
def get_cat_code(s):
    """Return a one-letter, upper-case category code for ``s``.

    Only the first character of ``unicode(s)`` is considered. If that
    character has a Pinyin reading, the code is the upper-cased first
    letter of its first reading; otherwise the character is treated as
    non-Chinese and is returned upper-cased itself.

    Indexing ``unicode(s)[0]`` means an empty ``s`` raises IndexError.
    """
    char = unicode(s)[0]

    cjk = CharacterLookup("C")
    # Request readings without tone marks. With the default diacritic tone
    # marks a vowel-initial syllable would start with an accented letter,
    # which would produce an accented code instead of the plain letter.
    readings = cjk.getReadingForCharacter(char, "Pinyin", toneMarkType="none")
    if not readings:
        # Not Chinese, just use first character as code
        return char.upper()

    # It's very hard to determine which reading is correct for our case,
    # so don't bother to check it: use the first one and let users fix it
    # if it is incorrect.
    reading = readings[0]

    # We use the first letter as code
    return reading[0].upper()
Esempio n. 4
0
def tokenize(input, output):
    """Copy ``input`` to ``output`` keeping only characters that have a Pinyin reading.

    ``input`` is read as UTF-8, line by line. Each character is looked up
    with ``CharacterLookup('T')``; characters without a Pinyin reading are
    dropped. One output line, terminated by a newline and encoded as UTF-8,
    is written per input line.

    If ``input`` cannot be opened or read, a message is printed and the
    process exits with status 1.
    """
    try:
        # The context manager closes the input file even if reading fails.
        with open(input, 'r') as source:
            text = source.readlines()
    except IOError:
        print("IOError: could not open %s" % (input,))
        # Exit with a non-zero status so callers can detect the failure.
        sys.exit(1)

    cjk = CharacterLookup('T')

    # The context manager closes the output file even if a lookup or a
    # decode raises part-way through.
    with open(output, 'w') as out:
        for line in text:
            line = line.decode('utf-8')
            kept = [char for char in line
                    if cjk.getReadingForCharacter(char, 'Pinyin')]
            # The original line break has no reading and is filtered out
            # above, so a newline is added back explicitly.
            out.write((u"".join(kept) + u'\n').encode('utf-8'))
Esempio n. 5
0
def to_pinyin(filename):
    """Print the contents of ``filename`` annotated with Pinyin.

    The file is read as UTF-8, line by line. Every character is looked up
    with ``CharacterLookup('T')``. For each character that has a Pinyin
    reading, the list of all its readings (passed through ``unidecode``) is
    printed, and the first reading, the character itself and a space are
    added to the output line. Characters without a reading are skipped.
    The assembled line is printed after each input line.

    If ``filename`` cannot be opened or read, a message is printed and the
    process exits with status 1.
    """
    try:
        # The context manager closes the file even if reading fails.
        with open(filename, 'r') as source:
            lines = source.readlines()
    except IOError:
        print("IOError: could not open %s" % (filename,))
        # Exit with a non-zero status so callers can detect the failure.
        sys.exit(1)

    cjk = CharacterLookup('T')

    # The file contents are processed here. The previous version replaced
    # them with a hard-coded sample sentence, so `filename` had no effect
    # on the output.
    for raw_line in lines:
        line = raw_line.decode('utf-8')
        pieces = []
        for char in line:
            pinyin = cjk.getReadingForCharacter(char, 'Pinyin')
            if pinyin:
                print([unidecode(x) for x in pinyin])
                # Only the first reading is used for the annotated line.
                pieces.append(unidecode(pinyin[0]) + char + " ")
        print(u"".join(pieces))
Esempio n. 6
0
# Look up a translation for `src` group by group and print one
# "Translation:" line per group.
# NOTE(review): `src`, `mica`, `tryce` and `cjk` are defined outside this
# excerpt; the comments below describe only what this fragment does with them.
try :
    # Characters for which the "too many readings" warning below is suppressed.
    ignore_bad = [u'的',u'了']
    # presumably mica.trans() returns `src` split into space-separated
    # groups - TODO confirm against mica.
    parsed = mica.trans(src).strip()
    print "Parsed result: " + parsed
    for group in parsed.split(" ") :
        group = group.strip()
        uni = unicode(group, "UTF-8")
        # NOTE(review): both branches of this conditional are True, so the
        # second argument is always True whatever len(group) is - confirm
        # whether the else branch was meant to be False.
        trans = tryce(uni, True if len(group) == 1 else True)
        if trans == u'' :
            # The whole-group lookup returned nothing: retry one character
            # at a time.
            for char in uni :
                cr = tryce(char, True)
                if cr != u'' :
                    trans += cr
                else :
                    # The single-character lookup returned nothing as well:
                    # fall back to the Pinyin readings from `cjk`.
                    cr = cjk.getReadingForCharacter(char,'Pinyin')
                    if cr : 
                        if char not in ignore_bad and len(cr) > 1 :
                            print "Warning: " + char + " has too many readings: "
                            for x in cr :
                                print " " + x
                        # Only the first reading is appended.
                        trans += cr[0]

        # No lookup produced anything for this group: print a placeholder.
        if trans == u'' :
            trans = "none."
        # Newlines are removed so each group is printed on a single line.
        print ("Translation: " + uni + ":" + trans).replace("\n","")
except mica.error, e :
    # Only mica.error is handled here; its message is printed.
    print str(e)