def add_words_dict(word, pinyin, freq):
    """Record a new ``(pinyin, freq)`` pair for ``word`` in ``words_dict``.

    Each ``words_dict`` value is a ``(status, pinyins)`` tuple where
    ``pinyins`` is a list of ``(pinyin, freq)`` tuples.  A word seen for the
    first time is stored with status ``Untouched``.  For a word that is
    already present, the pair is appended to its existing list.

    This variant is strict: it never merges entries.  It asserts that the
    pinyin is already toneless, that the existing entry is still
    ``Untouched``, and that the same pinyin has not been recorded for this
    word before.

    Raises:
        AssertionError: If any of the conditions above does not hold.
    """
    # assume all tones are already removed
    assert pinyin == strip_tone(pinyin)

    if word not in words_dict:
        # First sighting of this word: start a fresh entry.
        words_dict[word] = (Untouched, [(pinyin, freq)])
        return

    state, readings = words_dict[word]
    assert Untouched == state

    # Every reading already stored for this word must differ from the new one.
    for known_pinyin, _known_freq in readings:
        assert known_pinyin != pinyin

    readings.append((pinyin, freq))
def handle_pinyin(outfile, word, num, pinyin):
    """Write one ``word<TAB>pinyin<TAB>freq`` line to ``outfile``.

    ``pinyin`` is either a bare pinyin string or has the form
    ``"<pinyin>:<number>%"``.  Without the suffix, the frequency is
    ``total_frequency / num``.  With it, the frequency is ``total_frequency``
    multiplied by the number in front of ``%``, and only the part before the
    colon is written as the pinyin.  In both cases the value is truncated to
    an ``int`` and raised to at least ``minimum`` before being written.

    Args:
        outfile: Writable text stream; only ``writelines`` is called on it.
        word: Text written in the first column.
        num: Divisor used when no explicit weight is given (presumably the
            number of pinyin readings of ``word`` -- verify against caller).
        pinyin: Pinyin string, optionally followed by ``:<number>%``.

    Raises:
        AssertionError: If ``strip_tone(pinyin)`` differs from ``pinyin``, or
            if the part after the colon does not end with ``%``.
        ZeroDivisionError: If ``num`` is zero and no weight suffix is given.
    """
    # no tones in opengram dictionary
    assert strip_tone(pinyin) == pinyin

    if ":" not in pinyin:
        freq = total_frequency / num
    else:
        pinyin, weight = pinyin.split(":", 1)
        assert weight.endswith("%")
        # NOTE(review): the percent number is used as-is ("50%" -> 50.0), with
        # no division by 100.  Confirm total_frequency is scaled for that.
        freq = total_frequency * float(weight.rstrip("%"))

    freq = max(int(freq), minimum)

    oneline = "\t".join((word, pinyin, str(freq)))
    # Use "\n" rather than os.linesep: a text-mode stream already translates
    # "\n" to the platform line ending, so writing os.linesep would produce
    # "\r\r\n" on Windows.
    outfile.writelines((oneline, "\n"))
def add_words_dict(word, pinyin, freq):
    """Add ``freq`` for ``(word, pinyin)`` to ``words_dict``, merging duplicates.

    The pinyin is first passed through ``strip_tone``.  Each ``words_dict``
    value is a list of ``(pinyin, freq)`` tuples.  A word seen for the first
    time gets a new one-element list.  For a word that is already present,
    every stored entry whose pinyin equals the stripped pinyin is replaced by
    one carrying the summed frequency, and a ``Collapse: ...`` message is
    printed for each such entry.  If no entry matches, the pair is appended.
    """
    toneless = strip_tone(pinyin)

    if word not in words_dict:
        # First sighting of this word: start a fresh list.
        words_dict[word] = [(toneless, freq)]
        return

    readings = words_dict[word]
    merged = False
    # No early exit: the scan always runs over the whole list, and freq keeps
    # accumulating if more than one stored entry matches.
    for index, (known_pinyin, known_freq) in enumerate(readings):
        if known_pinyin != toneless:
            continue
        # print out the collapsed word and pinyin pair
        print('Collapse: {0} and {1}'.format(word, toneless))
        freq += known_freq
        readings[index] = (toneless, freq)
        merged = True

    if not merged:
        readings.append((toneless, freq))