def divideIntoSections(self):
    '''
    Convert every syllable's text to pinyin and group the syllables into
    sentences.

    Same as lyricsParser.divideIntoSections, except that the syllables are
    read from self.listSyllables.

    For each syllable:
      * stripPunctuationSings() is applied to its text; the call returns an
        end-of-sentence flag together with the replacement text.
      * unless the text is 'REST', it is looked up with
        CharacterLookup('C').getReadingForCharacter(..., 'Pinyin',
        toneMarkType='none') and replaced by the first reading returned.
        A warning is logged when several readings are returned, and when
        none is returned (the text is then left unchanged).
      * the syllable is added to the current sentence; when the
        end-of-sentence flag is set the sentence is appended to
        self.listSentences and a new one is started.

    Side effects: mutates syl.text of each syllable and appends to
    self.listSentences. Returns None.
    '''
    # The lookup object does not depend on the syllable, so build it once
    # and only if at least one non-REST syllable is actually seen.
    cjk = None
    currSectionLyrics = []

    for syl in self.listSyllables:
        isEndOfSentence, syl.text = stripPunctuationSings(syl.text)

        ### convert from mandarin to pinyin
        if syl.text != 'REST':
            if cjk is None:
                cjk = CharacterLookup('C')
            textPinYinList = cjk.getReadingForCharacter(syl.text, 'Pinyin', toneMarkType='none')
            if textPinYinList:
                if len(textPinYinList) > 1:
                    self.logger.warn("converted syllable {} has {} parts".format(textPinYinList, len(textPinYinList)))
                syl.text = textPinYinList[0]  # take only first variant of pinyin interpretations
            else:
                # No reading available: keep the original text instead of
                # failing with IndexError on textPinYinList[0].
                # %r keeps the message ASCII-safe for non-ASCII text.
                self.logger.warn("no pinyin reading found for syllable %r; text left unchanged" % (syl.text,))

        ### finish up sentence when punctuation present
        currSectionLyrics.append(syl)
        if isEndOfSentence:
            self.listSentences.append(currSectionLyrics)
            currSectionLyrics = []

    # NOTE(review): syllables that follow the last end-of-sentence marker are
    # not added to self.listSentences (unchanged from the original) --
    # confirm whether a trailing partial sentence should be kept.
def mandarinToPinyin(mandarinChar):
    """Return the first Pinyin reading reported for *mandarinChar*.

    The readings are requested from ``CharacterLookup('C')`` with
    ``toneMarkType='none'``. When more than one reading comes back, a
    notice listing all of them is printed and the first one is used.

    Raises IndexError if the lookup returns no reading at all.
    """
    lookup = CharacterLookup('C')
    readings = lookup.getReadingForCharacter(
        mandarinChar, 'Pinyin', toneMarkType='none')

    readingCount = len(readings)
    if readingCount > 1:
        print("converted syllable {} has {} parts".format(readings, readingCount))

    # Only the first of the candidate readings is kept.
    return readings[0]
def get_cat_code(s):
    """Return a one-character, upper-cased category code for *s*.

    Only the first character of ``unicode(s)`` is considered. If
    ``CharacterLookup("C")`` reports Pinyin readings for it, the code is
    the first letter of the first reading; otherwise (e.g. the character
    is not Chinese) the character itself is used.

    Raises IndexError if ``unicode(s)`` is empty.
    """
    first = unicode(s)[0]
    candidates = CharacterLookup("C").getReadingForCharacter(first, "Pinyin")

    if candidates:
        # Working out which reading is the right one in context is hard,
        # so no attempt is made: the first reading is taken and users are
        # expected to correct the code if it turns out to be wrong.
        return candidates[0][0].upper()

    # No reading available: fall back to the character itself.
    return first.upper()
def tokenize(input, output):
    """Copy *input* to *output*, keeping only characters that have a
    Pinyin reading.

    *input* is read as UTF-8. For every input line, each character for
    which ``CharacterLookup('T').getReadingForCharacter(char, 'Pinyin')``
    returns a non-empty result is kept; everything else (including the
    original line terminator) is dropped. The kept characters followed by
    a single newline are written to *output*, encoded as UTF-8.

    Args:
        input: path of the UTF-8 text file to read. (The name shadows the
            builtin but is kept so keyword callers do not break.)
        output: path of the file to write; it is overwritten.

    Raises:
        SystemExit: with status 1, after printing a message, when *input*
            cannot be opened or read.
        UnicodeDecodeError: when *input* is not valid UTF-8.
    """
    # Function-scope import so the block does not depend on the file's
    # top-level import list.
    import io

    # The whole input is read (and decoded) before the output is opened,
    # so a bad input never truncates an existing output file and
    # input == output keeps working.
    try:
        # newline='\n': split lines on '\n' only, as the plain 'r' mode
        # of the original did.
        with io.open(input, 'r', encoding='utf-8', newline='\n') as src:
            text = src.readlines()
    except IOError:
        print("IOError: could not open %s" % (input,))
        # Non-zero status so that callers and shells can see the failure.
        sys.exit(1)

    cjk = CharacterLookup('T')
    with io.open(output, 'w', encoding='utf-8') as out:
        for line in text:
            kept = [char for char in line
                    if cjk.getReadingForCharacter(char, 'Pinyin')]
            out.write(u''.join(kept) + u'\n')
def to_pinyin(filename):
    """Print a Pinyin-annotated version of every line of *filename*.

    The file is read as UTF-8. For each character that has at least one
    reading according to
    ``CharacterLookup('T').getReadingForCharacter(char, 'Pinyin')``:

      * the list of all its readings, passed through ``unidecode``, is
        printed;
      * ``unidecode(first reading) + character + " "`` is appended to the
        output line.

    Characters without a reading are skipped. After each input line the
    assembled output line is printed.

    The earlier version replaced the lines read from the file with a
    hard-coded sample sentence, so the file content was never processed;
    the file's own lines are used here.

    Args:
        filename: path of the UTF-8 text file to read.

    Raises:
        SystemExit: with status 1, after printing a message, when
            *filename* cannot be opened or read.
        UnicodeDecodeError: when the file is not valid UTF-8.
    """
    # Function-scope import so the block does not depend on the file's
    # top-level import list.
    import io

    try:
        # newline='\n': split lines on '\n' only, as the plain 'r' mode
        # of the original did.
        with io.open(filename, 'r', encoding='utf-8', newline='\n') as src:
            lines = src.readlines()
    except IOError:
        print("IOError: could not open %s" % (filename,))
        # Non-zero status so that callers and shells can see the failure.
        sys.exit(1)

    cjk = CharacterLookup('T')
    for line in lines:
        pieces = []
        for char in line:
            pinyin = cjk.getReadingForCharacter(char, 'Pinyin')
            if pinyin:
                print([unidecode(x) for x in pinyin])
                # Only the first reading is used for the annotation.
                pieces.append(unidecode(pinyin[0]) + char + " ")
        print(u"".join(pieces))
# Translate `src` group by group and print one "Translation:" line per group.
# NOTE(review): `mica`, `src`, `tryce` and `cjk` are defined outside this
# fragment; tryce() presumably returns u'' when it finds nothing -- confirm.
try:
    # Characters for which the "too many readings" warning is suppressed.
    ignore_bad = [u'的', u'了']

    parsed = mica.trans(src).strip()
    print("Parsed result: " + parsed)

    for group in parsed.split(" "):
        group = group.strip()
        uni = unicode(group, "UTF-8")

        # First attempt: look the whole group up at once. (The original
        # passed `True if len(group) == 1 else True`, i.e. always True.)
        trans = tryce(uni, True)

        if trans == u'':
            # Whole-group lookup gave nothing: go character by character.
            for char in uni:
                single = tryce(char, True)
                if single != u'':
                    trans += single
                    continue

                # Last resort: take the first Pinyin reading, if any.
                readings = cjk.getReadingForCharacter(char, 'Pinyin')
                if not readings:
                    continue
                if len(readings) > 1 and char not in ignore_bad:
                    print("Warning: " + char + " has too many readings: ")
                    for reading in readings:
                        print(" " + reading)
                trans += readings[0]

        if trans == u'':
            trans = "none."

        print(("Translation: " + uni + ":" + trans).replace("\n", ""))
except mica.error as e:
    print(str(e))