Code Example #1
def preprocess_example(test_args=(
    '慶曆四年春,滕(téng)子京謫(zhé)守巴陵郡。越明年,政通人和,百廢俱興。乃重修岳陽樓,增其舊制,刻唐賢、今人詩賦於其上。', )):
    """
    Call the preprocessing function used by the evaluation; requires the mafan package.
    """
    from preprocessing_module import preprocess_text
    print('----Text preprocessing----')
    print(*test_args, preprocess_text(*test_args))
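preprocess_text itself is not included in these snippets; the docstring only notes that it needs the mafan package. A minimal sketch of what it might do, assuming the preprocessing simply normalizes traditional characters to simplified ones with mafan's simplify helper (the _sketch suffix marks the function as hypothetical):

from mafan import simplify

def preprocess_text_sketch(text):
    """Hypothetical stand-in for preprocessing_module.preprocess_text."""
    # mafan.simplify maps traditional Chinese characters to simplified ones
    return simplify(text.strip())

# e.g. preprocess_text_sketch('慶曆四年春') -> '庆历四年春'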
Code Example #2
import random

from pypinyin import lazy_pinyin
from Pinyin2Hanzi import DefaultHmmParams

# importance, top_k_transform, tokenize, important_words, dict_word,
# reference_model and model come from the surrounding project code


def transform(line, tf_idf_score, new_word_dictionary, black_list_word):
    """转换一行文本。

    :param line: 对抗攻击前的输入文本
    :type line: str
    :returns: str -- 对抗攻击后的输出文门
    """
    # Modify the logic below
    from preprocessing_module import preprocess_text

    line = preprocess_text(line)
    # Choose the proportion of text to modify; with the choices capped at 5,
    # the early return below never triggers, so every line gets transformed
    a = random.choice([1, 0, 2, 5, 4])
    if a >= 6:
        return line
    hmmparams = DefaultHmmParams()  # HMM parameters for Pinyin2Hanzi (pinyin -> hanzi)


    # Rank words by importance: score how abusive each word is
    imp_score = importance(line, tf_idf_score)


    # Modify a fixed proportion of the words; when the proportion is 0, change at least one
    out_line = top_k_transform(imp_score, line, 0, new_word_dictionary, black_list_word)
    out_line = "".join(out_line)
    out_line = out_line.replace('\n', '')
    m_line = tokenize(out_line)

    _list_m_line = list(m_line)

    # Replace occurrences of the character “你” (ni) with homophones

    for i, m_word in enumerate(m_line):
        if m_word in important_words:
            hanzi_of_target_text = ''
            pinyin_of_target_text = lazy_pinyin(m_word)
            if pinyin_of_target_text == ['ni']:
                hanzi_of_target_text = dict_word['ni']
            else:
                continue
            m_destination_word = m_word
            # Map the pinyin back to other Chinese characters
            nums_circle = 0
            # Pick a character that differs from the original and is not blacklisted
            while nums_circle <= 50:
                nums_circle += 1
                m_destination_word = random.choice(hanzi_of_target_text)
                if m_destination_word != m_word and m_destination_word not in black_list_word:
                    break
            _list_m_line[i] = m_destination_word

            temp = new_word_dictionary.get(m_destination_word, 0)
            temp += 1
            # Once a replacement character has been used 30 times, blacklist it
            if temp < 30:
                new_word_dictionary[m_destination_word] = temp
            else:
                new_word_dictionary.pop(m_destination_word)
                black_list_word.append(m_destination_word)
    # Join after the loop: joining inside it would leave m_line as a list
    # (and break .split() below) whenever no word was replaced
    m_line = ''.join(_list_m_line)
    # Strip any whitespace left over from tokenization
    out_line = ''.join(m_line.split())
    _line = out_line
    str_dot = ''  # never reassigned; kept for the final concatenation below

    # Get the model's probability on the unmodified line
    _ori_pro = reference_model(model, _line)
    _nums = 0

    # Append commas to the sentence, at most 50, stopping once |current prob - original prob| / original prob > 0.8
    for _ in range(50):
        _line += ','
        _nums += 1
        _pre_pro = reference_model(model, _line)
        if abs(_pre_pro - _ori_pro) / _ori_pro > 0.8:
            break
    out_line = _line + str_dot
    print('out_line,', out_line)
    return out_line
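transform imports DefaultHmmParams but draws its homophone candidates from a hand-built dict_word table. For reference, a small sketch of how the Pinyin2Hanzi package itself could supply candidates for a syllable through its viterbi decoder; DefaultHmmParams and viterbi are real Pinyin2Hanzi APIs, while the helper name is illustrative:

from Pinyin2Hanzi import DefaultHmmParams, viterbi

def homophone_candidates(pinyin, path_num=10):
    """Return candidate hanzi for one pinyin syllable via the HMM decoder."""
    hmmparams = DefaultHmmParams()
    # Each decoded path for a one-syllable observation holds a single character
    results = viterbi(hmm_params=hmmparams, observations=(pinyin,), path_num=path_num)
    return [item.path[0] for item in results]

# e.g. homophone_candidates('ni') could return characters such as 你, 尼, 泥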
Code Example #3
import time

start = time.perf_counter()  # time.clock() was removed in Python 3.8

# Your program goes here; inp_lines is assumed to hold the raw input lines
from preprocessing_module import preprocess_text
benchmark_text = [preprocess_text(_line) for _line in inp_lines]



tf_idf_score = tfidf_score_of_word(benchmark_text)
new_word_dictionary = {}

out_lines = []
black_list_word = []

for i, sen in enumerate(benchmark_text):
    out_lines.append(transform(sen, tf_idf_score[i], new_word_dictionary, black_list_word))

# out_lines = [transform(sen, tf_idf_score, new_word_dictionary) for sen in benchmark_text]
with open('my.txt', 'w', encoding='utf-8') as w:
    for m_line in out_lines:
        w.write(m_line + '\n')

print('elapsed seconds:', time.perf_counter() - start)
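tfidf_score_of_word is not shown in these snippets; from the way it is indexed per line (tf_idf_score[i]), it appears to return one word-to-score mapping per input sentence. A hedged sketch of such a helper built on scikit-learn and jieba; the name and return shape are assumptions:

import jieba
from sklearn.feature_extraction.text import TfidfVectorizer

def tfidf_score_of_word_sketch(lines):
    """Assumed behaviour: per line, map each token to its tf-idf weight."""
    vec = TfidfVectorizer(tokenizer=jieba.lcut, token_pattern=None)
    matrix = vec.fit_transform(lines)
    vocab = vec.get_feature_names_out()
    scores = []
    for i in range(matrix.shape[0]):
        row = matrix.getrow(i).toarray().ravel()
        scores.append({vocab[j]: row[j] for j in row.nonzero()[0]})
    return scores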
Code Example #4
File: main.py  Project: Codle/text-attack
import json
from copy import deepcopy

import fasttext
import jieba
import numpy as np

# Defined elsewhere in the project: args, DEFAULT_KEYVEC (word vectors with a
# gensim-style API), preprocess_text, compute_word_importance, dis_utils


def main():
    # Load the target model up front
    model = fasttext.load_model(args.model_path)
    with open(args.input_file, 'r', encoding='utf-8') as file:
        lines = file.readlines()

    out_lines = []
    for line in lines:
        # Clean the text
        text = preprocess_text(line)
        # Tokenize
        words = list(jieba.cut(text))
        # Keep in-vocabulary words; back off to in-vocabulary characters
        _temp = []
        for word in words:
            if word in DEFAULT_KEYVEC.vocab:
                _temp.append(word)
            else:
                for ch in word:
                    if ch in DEFAULT_KEYVEC.vocab:
                        _temp.append(ch)
        words = _temp
        # print(words)
        out = model.predict(text)

        # Step 1: compute word importance
        importances = compute_word_importance(model, words)
        importance_order = reversed(np.argsort(importances))
        # print(importances, importance_order)

        # Step 2: pick candidate words
        candidates = dict()
        for word in words:
            top_words = DEFAULT_KEYVEC.most_similar(positive=word, topn=300)
            sim_words = []
            for _word, _score in reversed(top_words):

                # if _score < 0.03:
                _word = preprocess_text(_word)
                _cutword = [_ for _ in jieba.cut(_word)]
                if len(_word) and len(_cutword) == 1:
                    sim_words.append(_word)
                # else:
                #     break
                if len(sim_words) > 100:
                    break
            candidates[word] = sim_words

        # Surface-form filter (normalized Levenshtein; the original comment said Jaccard)
        for key in candidates.keys():
            _candidate = []
            for candidate in candidates[key]:
                dis = dis_utils.normalized_levenshtein(key, candidate)
                # print(dis)
                if dis > 0.5:
                    _candidate.append(candidate)
            candidates[key] = _candidate

        # print(candidates)
        # break
        # Step 3: POS filtering (left unimplemented)
        # Step 4: sentence similarity (left unimplemented)

        # Step 5: substitute words into the text
        for order in importance_order:
            if len(candidates[words[order]]) == 0:
                continue
            temp = []
            for candidate in candidates[words[order]]:
                temp_words = deepcopy(words)
                temp_words[order] = candidate
                temp.append(''.join(temp_words))
            preds = model.predict(temp)
            # Check whether any candidate flips the original prediction
            preds_order = np.argsort(preds[1].reshape(-1))
            # print(preds_order)
            flag = -1
            for pred_order in preds_order:
                if preds[0][pred_order][0] != out[0][0]:
                    # print(preds[0][pred_order][0], out[0][0])
                    flag = pred_order
                    break
            if flag != -1:
                words[order] = candidates[words[order]][flag]
                break
            else:
                words[order] = candidates[words[order]][0]
        out_lines.append(''.join(words) + '\n')

    target = json.dumps({'text': out_lines}, ensure_ascii=False)
    with open(args.output_file, 'w', encoding='utf-8') as f:
        f.write(target)
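compute_word_importance is project code that is not part of this excerpt. A common choice for fastText-style attacks is leave-one-out deletion importance; the sketch below is an assumption about what such a scorer could look like, not the project's actual implementation:

def deletion_importance(model, words):
    """Score each word by how much deleting it erodes the original prediction."""
    base_labels, base_probs = model.predict(''.join(words))
    scores = []
    for i in range(len(words)):
        ablated = ''.join(words[:i] + words[i + 1:])
        labels, probs = model.predict(ablated)
        if labels[0] == base_labels[0]:
            # Same label: importance is the confidence drop
            scores.append(float(base_probs[0] - probs[0]))
        else:
            # Label flipped: deleting this word alone changes the prediction
            scores.append(float(base_probs[0] + probs[0]))
    return scores

# Used as importances = deletion_importance(model, words) with the fasttext
# model loaded in main() above.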