def preprocess_example(test_args=('慶曆四年春,滕(téng)子京謫(zhé)守巴陵郡。越明年,政通人和,百廢俱興。乃重修岳陽樓,增其舊制,刻唐賢、今人詩賦於其上。',)):
    """Call the preprocessing function used by the evaluation; requires mafan."""
    from preprocessing_module import preprocess_text
    print('----Text preprocessing----')
    print(*test_args, preprocess_text(*test_args))
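# preprocessing_module is not shown in this section. A minimal sketch of what
# preprocess_text might do, assuming it uses mafan to convert traditional
# characters to simplified and strips whitespace and pinyin annotations; the
# evaluation's actual implementation may differ. The regex and helper name
# below are hypothetical.
import re
from mafan import simplify  # assumed mafan API for traditional -> simplified

def preprocess_text_sketch(text):
    """Hypothetical stand-in for preprocessing_module.preprocess_text."""
    text = simplify(text)                            # traditional -> simplified
    text = re.sub(r'\([a-zA-Zàáǎāèéěēìíǐīòóǒōùúǔū]+\)', '', text)  # drop pinyin notes like (téng)
    return ''.join(text.split())                     # remove whitespace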
def transform(line, tf_idf_score, new_word_dictionary, black_list_word):
    """Transform one line of text.

    :param line: input text before the adversarial attack
    :type line: str
    :returns: str -- output text after the adversarial attack
    """
    # Modify the logic below.
    from preprocessing_module import preprocess_text
    preprocess_text(line)

    # Choose what proportion of the text to modify.
    a = random.choice([1, 0, 2, 5, 4])
    if a >= 6:
        return line

    hmmparams = DefaultHmmParams()  # HMM params for pinyin -> hanzi

    # Rank the words by importance, i.e. how abusive each word is.
    imp_score = importance(line, tf_idf_score)

    # Replace a proportion of the words; when the proportion is 0, at least one word is replaced.
    out_line = top_k_transform(imp_score, line, 0, new_word_dictionary, black_list_word)
    out_line = "".join(out_line)
    out_line = out_line.replace('\n', '')

    m_line = tokenize(out_line)
    _list_m_line = []
    for _word in m_line:
        _list_m_line.append(_word)

    # Replace the character "你" (pinyin "ni").
    for i, m_word in enumerate(m_line):
        if m_word in important_words:
            hanzi_of_target_test = ''
            pinyin_of_target_text = lazy_pinyin(m_word)
            if pinyin_of_target_text == ['ni']:
                hanzi_of_target_test = dict_word['ni']
            else:
                continue
            m_destination_word = m_word
            # Map the pinyin to a different Chinese character.
            nums_circle = 0
            # Pick a character that differs from the original and is not blacklisted.
            while nums_circle <= 50:
                nums_circle += 1
                m_destination_word = random.choice(hanzi_of_target_test)
                if m_destination_word != m_word and m_destination_word not in black_list_word:
                    break
            else:
                # No valid replacement found within 50 tries; skip this word.
                continue
            _list_m_line[i] = m_destination_word
            temp = new_word_dictionary.get(m_destination_word, 0)
            temp += 1
            # Once this new word has appeared 30 times, move it to the blacklist.
            if temp < 30:
                new_word_dictionary[m_destination_word] = temp
            else:
                new_word_dictionary.pop(m_destination_word)
                black_list_word.append(m_destination_word)
    # Join unconditionally so m_line is a string even when no replacement happened.
    m_line = ''.join(_list_m_line)

    out_line = m_line.split()
    out_line = ''.join(out_line)

    _line = out_line
    str_dot = ''
    # Compute the starting probability.
    _ori_pro = reference_model(model, _line)
    _nums = 0
    # Append commas to the end of the sentence, at most 50; stop when
    # (current probability - original probability) / original probability > 0.8.
    for i in range(50):
        _line += ','
        _nums += 1
        _pre_pro = reference_model(model, _line)
        if abs(_pre_pro - _ori_pro) / _ori_pro > 0.8:
            break

    out_line = _line + str_dot
    print('outline,', out_line)
    return out_line
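# dict_word (pinyin -> candidate hanzi) is built elsewhere in the repo.
# A minimal sketch, assuming it comes from Pinyin2Hanzi's HMM decoder (the
# DefaultHmmParams imported above); the real mapping may well be hand-crafted.
# The helper name and parameters below are hypothetical.
from Pinyin2Hanzi import DefaultHmmParams, viterbi

def build_homophone_dict(pinyins=('ni',), path_num=20):
    """Hypothetical helper: map each pinyin to a string of homophone characters."""
    hmmparams = DefaultHmmParams()
    mapping = {}
    for py in pinyins:
        # viterbi returns the most likely hanzi paths for the pinyin sequence.
        items = viterbi(hmm_params=hmmparams, observations=(py,), path_num=path_num)
        mapping[py] = ''.join(item.path[0] for item in items)
    return mapping

# dict_word = build_homophone_dict()  # e.g. dict_word['ni'] might yield '你尼呢妮泥...'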
import time

start = time.perf_counter()  # time.clock() was removed in Python 3.8
# The program being timed goes below.
from preprocessing_module import preprocess_text

benchmark_text = [preprocess_text(_line) for _line in inp_lines]
tf_idf_score = tfidf_score_of_word(benchmark_text)
new_word_dictionary = {}
out_lines = []
black_list_word = []
for i, sen in enumerate(benchmark_text):
    out_lines.append(transform(sen, tf_idf_score[i], new_word_dictionary, black_list_word))
# out_lines = [transform(sen, tf_idf_score, new_word_dictionary) for sen in benchmark_text]
with open('my.txt', 'w', encoding='utf-8') as w:
    for m_line in out_lines:
        w.write(m_line + '\n')
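# reference_model and tfidf_score_of_word are defined elsewhere in the repo.
# A minimal sketch of reference_model, assuming `model` is a fastText
# classifier and the score of interest is the probability of the predicted
# label; the actual helper may differ. The function name is hypothetical.
def reference_model_sketch(model, text):
    """Hypothetical stand-in: probability the classifier assigns to its top label."""
    labels, probs = model.predict(text.replace('\n', ' '))
    return float(probs[0])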
def main():
    # Load the pretrained model used for evaluation.
    model = fasttext.load_model(args.model_path)
    with open(args.input_file, 'r', encoding='utf-8') as file:
        lines = file.readlines()
    out_lines = []
    for line in lines:
        # Clean the text.
        text = preprocess_text(line)
        # Segment the text into words.
        words = [word for word in jieba.cut(text)]
        _temp = []
        for word in words:
            if word in DEFAULT_KEYVEC.vocab:
                _temp.append(word)
            else:
                # Fall back to individual characters that are in the vocabulary.
                for ch in word:
                    if ch in DEFAULT_KEYVEC.vocab:
                        _temp.append(ch)
                    else:
                        continue
        words = _temp
        # print(words)
        out = model.predict(text)

        # Step 1: compute word importance.
        importances = compute_word_importance(model, words)
        importance_order = list(reversed(np.argsort(importances)))
        # print(importances, importance_order)

        # Step 2: select candidate words.
        candidates = dict()
        for word in words:
            top_words = DEFAULT_KEYVEC.most_similar(positive=word, topn=300)
            sim_words = []
            for idx, (_word, _score) in enumerate(reversed(top_words)):
                # if _score < 0.03:
                _word = preprocess_text(_word)
                _cutword = [_ for _ in jieba.cut(_word)]
                if len(_word) and len(_cutword) == 1:
                    sim_words.append(_word)
                # else:
                #     break
                if len(sim_words) > 100:
                    break
            candidates[word] = sim_words

        # Filter candidates by normalized Levenshtein distance.
        for key in candidates.keys():
            _candidate = []
            for candidate in candidates[key]:
                dis = dis_utils.normalized_levenshtein(key, candidate)
                # print(dis)
                if dis > 0.5:
                    _candidate.append(candidate)
            candidates[key] = _candidate
        # print(candidates)
        # break

        # Step 3: part-of-speech filtering.
        # Step 4: sentence similarity.
        # Step 5: replace words in the text.
        for order in importance_order:
            if len(candidates[words[order]]) == 0:
                continue
            temp = []
            for candidate in candidates[words[order]]:
                temp_words = deepcopy(words)
                temp_words[order] = candidate
                temp.append(''.join(temp_words))
            preds = model.predict(temp)
            # Check whether any candidate flips the predicted label.
            preds_order = np.argsort(preds[1].reshape(-1))
            # print(preds_order)
            flag = -1
            for pred_order in preds_order:
                if preds[0][pred_order][0] != out[0][0]:
                    # print(preds[0][pred_order][0], out[0][0])
                    flag = pred_order
                    break
            if flag != -1:
                words[order] = candidates[words[order]][flag]
                break
            else:
                words[order] = candidates[words[order]][0]
        out_lines.append(''.join(words) + '\n')

    target = json.dumps({'text': out_lines}, ensure_ascii=False)
    with open(args.output_file, 'w', encoding='utf-8') as f:
        f.write(target)
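# compute_word_importance is defined elsewhere in the repo. A minimal
# leave-one-out sketch, assuming importance is the drop in the classifier's
# confidence when a word is deleted; the actual scoring may differ. The
# function name below is hypothetical.
import numpy as np

def compute_word_importance_sketch(model, words):
    """Hypothetical stand-in: score each word by how much removing it
    lowers the probability of the originally predicted label."""
    base_labels, base_probs = model.predict(''.join(words))
    base_label, base_prob = base_labels[0], float(base_probs[0])
    scores = []
    for i in range(len(words)):
        ablated = ''.join(words[:i] + words[i + 1:])
        if not ablated:
            # Deleting the only word removes all evidence; treat as maximally important.
            scores.append(base_prob)
            continue
        labels, probs = model.predict(ablated)
        if labels[0] != base_label:
            # The label flips without this word, so it is maximally important.
            scores.append(base_prob)
        else:
            scores.append(base_prob - float(probs[0]))
    return np.array(scores)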