def gen_pinyin2hanzi(pinyin, num=5):
    """Decode a pinyin sequence into candidate hanzi strings.

    :param pinyin: list of pinyin syllables (observations for the HMM)
    :param num: how many candidate paths to return
    :returns: dict mapping each candidate hanzi string to its (log) score
    """
    params = DefaultHmmParams()
    candidates = viterbi(hmm_params=params, observations=pinyin,
                         path_num=num, log=True)
    return {''.join(cand.path): cand.score for cand in candidates}
def top_k_transform(importance_score, list_of_texts, porpotion, new_word_dictionary, black_list_word):
    """Replace the top-k most important Chinese tokens with near-homophones.

    The k highest-scoring tokens (k = len(tokens) * porpotion + 1) are
    converted to pinyin, optionally perturbed through a small confusion
    table, decoded back to hanzi candidates with the HMM viterbi decoder,
    and the most similar candidate is substituted into the text.

    :param importance_score: per-token importance scores, assumed parallel to
        the tokenization of ``list_of_texts`` — TODO confirm with caller
    :param list_of_texts: the input text (a single string)
    :param porpotion: fraction of tokens to transform (parameter name kept
        as-is for backward compatibility)
    :param new_word_dictionary: running usage counts of substituted words;
        a word used 20 times is retired to ``black_list_word``
    :param black_list_word: words that must not be used as substitutes
    :returns: the transformed text string
    """
    hmmparams = DefaultHmmParams()  # HMM parameters for pinyin -> hanzi
    target_text = tokenize(list_of_texts).split(' ')
    k = int(len(target_text) * porpotion) + 1
    top_k_score = heapq.nlargest(k, importance_score)
    # NOTE(review): list.index() returns the FIRST position of a score, so
    # duplicate scores all map to the same index — confirm this is intended.
    top_k_score_index = [importance_score.index(score) for score in top_k_score]
    # Single-syllable confusion table: pinyin -> pool of near-homophone
    # spellings to sample from (equivalent to the original if-chain).
    confusion = {
        ('ni',): [['li'], ['ni']],
        ('ta',): [['ta'], ['te']],
        ('cao',): [['ca'], ['cao']],
        ('ma',): [['me'], ['ma']],
        ('si',): [['shi'], ['si']],
    }
    for index in top_k_score_index:
        # snapshot of the current text, passed to the similarity ranking
        gedit_text = copy.deepcopy(list_of_texts)
        if not is_Chinese(target_text[index]):
            continue
        pinyin_of_target_text = lazy_pinyin(target_text[index])
        key = tuple(pinyin_of_target_text)
        if key in confusion:
            pinyin_of_target_text = random.choice(confusion[key])
        try:
            # decode the pinyin back into the 10 most likely hanzi paths
            hanzi_of_target_test = viterbi(hmm_params=hmmparams,
                                           observations=pinyin_of_target_text,
                                           path_num=10)
            # Greedily pick the candidate most similar to the original word.
            # (fix) the original passed ``list_of_texts[i]`` where ``i`` was
            # undefined, so every iteration raised NameError that the bare
            # except silently swallowed; ``index`` is the intended variable.
            m_destination_word = calculate_similarity(target_text, index,
                                                      list_of_texts[index],
                                                      hanzi_of_target_test,
                                                      gedit_text,
                                                      black_list_word)
            target_text[index] = m_destination_word
            list_of_texts = ''.join(target_text)
            # Count uses of the new word; after 20 uses, move it to the
            # blacklist so it is not reused.
            temp = new_word_dictionary.get(m_destination_word, 0) + 1
            if temp < 20:
                new_word_dictionary[m_destination_word] = temp
            else:
                new_word_dictionary.pop(m_destination_word)
                black_list_word.append(m_destination_word)
        except Exception:
            # best-effort: skip words the decoder cannot handle
            pass
    return list_of_texts
def __init__(self):
    # Initialization: load HMM/DAG decoding parameters and build the static
    # tables of valid pinyin initials (shengmu) and, per initial, the finals
    # (yunmu) that can follow it.
    self.hmmparams = DefaultHmmParams()
    self.dagparams = DefaultDagParams()
    self.result = ''
    # All pinyin initials; the two-letter initials come last.
    self.shengmu = ['b', 'c', 'd', 'f', 'g', 'h', 'j', 'k', 'l', 'm', 'n', 'p', 'q', 'r', 's', 't', 'w', 'x', 'y', 'z', 'ch', 'sh', 'zh']
    # Finals that form a syllable on their own (no initial).
    self.yy = ['a', 'ai', 'an', 'ang', 'ao', 'e', 'en', 'eng', 'er', 'o', 'ou', 'ong']
    # ym_<initial>: the finals that may follow each initial.
    self.ym_b = ["a", "ai", "an", "ang", "ao", "ei", "en", "eng", "i", "ian", "iao", "ie", "in", "ing", "o", "u"]
    self.ym_c = ["a", "ai", "an", "ang", "ao", "e", "en", "eng", "i", "ong", "ou", "u", "uan", "ui", "un", "uo"]
    self.ym_d = ["a", "ai", "an", "ang", "ao", "e", "ei", "en", "eng", "i", "ia", "ian", "iao", "ie", "ing", "iu", "ong", "ou", "u", "uan", "ui", "un", "uo"]
    self.ym_f = ["a", "an", "ang", "ei", "en", "eng", "iao", "o", "ou", "u"]
    self.ym_g = ["a", "ai", "an", "ang", "ao", "e", "ei", "en", "eng", "ong", "ou", "u", "uai", "uan", "uang", "ui", "un", "uo"]
    self.ym_h = ["a", "ai", "an", "ang", "ao", "e", "ei", "en", "eng", "ong", "ou", "u", "ua", "uai", "uan", "uang", "ui", "un", "uo"]
    self.ym_j = ["i", "ia", "ian", "iang", "iao", "ie", "in", "ing", "iong", "iu", "u", "uan", "ue", "un"]
    self.ym_k = ["a", "ai", "an", "ang", "ao", "e", "en", "eng", "ong", "ou", "u", "ui", "un", "uo"]
    self.ym_l = ["a", "ai", "an", "ang", "ao", "e", "ei", "eng", "i", "ia", "ian", "iao", "ie", "in", "ing", "iu", "o", "ong", "ou", "u", "uan", "un", "uo", "v", "ve"]
    self.ym_m = ["a", "ai", "an", "ang", "ao", "e", "ei", "en", "eng", "i", "ian", "iao", "ie", "in", "ing", "iu", "o", "ou", "u"]
    self.ym_n = ["a", "ai", "an", "ang", "ao", "e", "ei", "en", "eng", "i", "ian", "iang", "iao", "ie", "in", "ing", "iu", "ong", "ou", "u", "uan", "un", "uo", "v", "ve"]
    self.ym_p = ["a", "ai", "an", "ang", "ao", "e", "ei", "en", "eng", "i", "ian", "iao", "ie", "in", "ing", "o", "ou", "u"]
    self.ym_q = ["i", "ia", "ian", "iang", "iao", "ie", "in", "ing", "iong", "iu", "u", "uan", "ue", "un"]
    self.ym_r = ["an", "ang", "ao", "e", "en", "eng", "i", "ong", "ou", "u", "ua", "uan", "ui", "un", "uo"]
    self.ym_s = ["a", "ai", "an", "ang", "ao", "e", "en", "eng", "i", "ong", "ou", "u", "uan", "ui", "un", "uo"]
    self.ym_t = ["a", "ai", "an", "ang", "ao", "e", "ei", "eng", "i", "ian", "iao", "ie", "ing", "ong", "ou", "u", "uan", "ui", "un", "uo"]
    self.ym_w = ["a", "ai", "an", "ang", "ei", "en", "eng", "o", "u"]
    self.ym_x = ["i", "ia", "ian", "iang", "iao", "ie", "in", "ing", "iong", "iu", "u", "uan", "ue", "un"]
    self.ym_y = ["a", "an", "ang", "ao", "e", "i", "in", "ing", "o", "ong", "ou", "u", "uan", "ue", "un"]
    self.ym_z = ["a", "ai", "an", "ang", "ao", "e", "ei", "en", "eng", "i", "ong", "ou", "u", "uan", "ui", "un", "uo"]
    self.ym_ch = ["a", "ai", "an", "ang", "ao", "e", "en", "eng", "i", "ong", "ou", "u", "ua", "uai", "uan", "uang", "ui", "un", "uo"]
    self.ym_sh = ["a", "ai", "an", "ang", "ao", "e", "ei", "en", "eng", "i", "ou", "u", "ua", "uai", "uan", "uang", "ui", "un", "uo"]
    self.ym_zh = ["a", "ai", "an", "ang", "ao", "e", "ei", "en", "eng", "i", "ong", "ou", "u", "ua", "uai", "uan", "uang", "ui", "un", "uo"]
    # Master table: entry 0 is the stand-alone finals, then one list per
    # initial in the same order as self.shengmu (b..z, ch, sh, zh).
    self.ym = [self.yy, self.ym_b, self.ym_c, self.ym_d, self.ym_f,
               self.ym_g, self.ym_h, self.ym_j, self.ym_k, self.ym_l,
               self.ym_m, self.ym_n, self.ym_p, self.ym_q, self.ym_r,
               self.ym_s, self.ym_t, self.ym_w, self.ym_x, self.ym_y,
               self.ym_z, self.ym_ch, self.ym_sh, self.ym_zh]
def __init__(self, useWhat='pickle', backen=False):
    """Initialize the store and optionally load the Pinyin2Hanzi backend.

    :param useWhat: storage backend selector (default ``'pickle'``)
    :param backen: when True, import Pinyin2Hanzi and keep its HMM params
        and viterbi decoder on the instance
    :raises Exception: if ``backen`` is True but Pinyin2Hanzi is missing
    """
    # (fix) removed a dead ``global backenFlag`` declaration: the name was
    # never read or assigned in this method.
    self.fileOK = False
    self.useWhat = useWhat
    self.DagsPath = None
    self.database = None
    self.pickle = None
    self.backen = backen
    if self.backen:
        try:
            from Pinyin2Hanzi import DefaultHmmParams
            from Pinyin2Hanzi import viterbi
            self.hmmparams = DefaultHmmParams()
            self.viterbi = viterbi
        # (fix) narrowed the bare ``except:`` so that only a missing
        # package produces this message; other errors now propagate.
        except ImportError as exc:
            raise Exception('lost Pinyin2Hanzi package,'
                            'please find that package ,'
                            'or set backen=False') from exc
    self.CheckFiles()
#!/usr/bin/env python
# -*- coding: utf-8 -*-
# Round-trip sanity check: hanzi -> pinyin -> hanzi should reproduce the
# original text for this sample sentence.
from pypinyin import pinyin, lazy_pinyin, Style
from Pinyin2Hanzi import DefaultHmmParams
from Pinyin2Hanzi import viterbi

txt = u'锄禾日当午'
py = lazy_pinyin(txt)
hmmparams = DefaultHmmParams()
# Only the single best decoding path is requested.
result = viterbi(hmm_params=hmmparams, observations=py, path_num=1)
for item in result:
    txt_rtn = u''.join(item.path)
    # (fix) the original used Python-2-only print statements, which are a
    # SyntaxError on Python 3; the parenthesized form runs on both.
    if txt == txt_rtn:
        print(u'OK')
    else:
        print(u'Error: %s -> %s -> %s' % (txt, py, txt_rtn))
# -*- coding:UTF-8 -*- # Create time: 2019-12-20 # Code by: hjfzzm import socket import threading from Pinyin2Hanzi import DefaultHmmParams from Pinyin2Hanzi import is_pinyin from Pinyin2Hanzi import simplify_pinyin from Pinyin2Hanzi import viterbi # 全局变量 re_string = "" # host = '172.17.228.35' host = '127.0.0.1' port = 10086 hmm = DefaultHmmParams() class PyTrieNode(object): def __init__(self, key="", seq=None): if seq is None: seq = [] self.key = key self.end = len(seq) == 0 self.children = {} if len(seq) > 0: self.children[seq[0]] = PyTrieNode(seq[0], seq[1:]) def add(self, seq): if len(seq) == 0: self.end = True
def transform(line, tf_idf_score, new_word_dictionary, black_list_word):
    """Transform one line of text (adversarial attack).

    :param line: input text before the adversarial attack
    :type line: str
    :param tf_idf_score: tf-idf scores used to rank word importance
    :param new_word_dictionary: running usage counts of substituted words;
        a word used 30 times is retired to ``black_list_word``
    :param black_list_word: words that must not be used as substitutes
    :returns: str -- output text after the adversarial attack
    """
    from preprocessing_module import preprocess_text
    # NOTE(review): the return value is discarded — presumably called for a
    # side effect; confirm against preprocessing_module.
    preprocess_text(line)
    # Draw kept for random-stream fidelity; a is in {0, 1, 2, 4, 5}, so the
    # early return below can never trigger (dead guard, documented as such).
    a = random.choice([1, 0, 2, 5, 4])
    if a >= 6:
        return line
    # (fix) removed an unused ``DefaultHmmParams()`` construction here: the
    # object was never referenced and building it is expensive.
    # Score every word's offensiveness/importance.
    imp_score = importance(line, tf_idf_score)
    # Transform a proportion of the words; proportion 0 still changes at
    # least one word (top_k_transform adds 1 to k).
    out_line = top_k_transform(imp_score, line, 0, new_word_dictionary, black_list_word)
    out_line = "".join(out_line)
    out_line = out_line.replace('\n', '')
    m_line = tokenize(out_line)
    _list_m_line = list(m_line)
    # Replace occurrences of words pronounced "ni" with look-alike characters.
    for i, m_word in enumerate(m_line):
        if m_word in important_words:
            hanzi_of_target_test = ''
            pinyin_of_target_text = lazy_pinyin(m_word)
            if pinyin_of_target_text == ['ni']:
                hanzi_of_target_test = dict_word['ni']
            else:
                continue
            m_destination_word = m_word
            nums_circle = 0
            # Sample a replacement that differs from the original and is not
            # blacklisted; give up after ~50 draws.
            while nums_circle <= 50:
                nums_circle += 1
                m_destination_word = random.choice(hanzi_of_target_test)
                if m_destination_word != m_word and m_destination_word not in black_list_word:
                    break
            _list_m_line[i] = m_destination_word
            m_line = ''.join(_list_m_line)
            # Count uses of the new word; after 30 uses, blacklist it.
            temp = new_word_dictionary.get(m_destination_word, 0) + 1
            if temp < 30:
                new_word_dictionary[m_destination_word] = temp
            else:
                new_word_dictionary.pop(m_destination_word)
                black_list_word.append(m_destination_word)
    out_line = ''.join(m_line.split())
    _line = out_line
    # Baseline model probability before padding with commas.
    _ori_pro = reference_model(model, _line)
    # Append commas (at most 50) until the relative probability change
    # exceeds 0.8. (fix) dropped the unused ``_nums`` counter and the
    # always-empty ``str_dot`` suffix from the original.
    for _ in range(50):
        _line += ','
        _pre_pro = reference_model(model, _line)
        if abs(_pre_pro - _ori_pro) / _ori_pro > 0.8:
            break
    out_line = _line
    print('outline,', out_line)
    return out_line