Example #1
def get_the_final_data():
    """
    获得最终版本的数据,数据格式为[q11,q21,q12,q22,label]
    :return:
    """
    save_data_dir = "./final_data/final_data_2/"
    if not os.path.exists(save_data_dir):
        os.makedirs(save_data_dir)   # makedirs, since the parent "./final_data/" may not exist yet
    cilinpath = "./cilin.txt"
    file_path_json = "./dataset/train_set.json"
    same_pinyin_file = "./same_pinyin.txt"
    chinese_word_freq_file = "./chinese-words.txt"
    save_data_synwords_and_samepinyin = save_data_dir + "data_replace_by_synwords_and_samepinyin.txt"
    data, true_data, false_data = read_data(file_path_json)
    data_eva = synword_and_samepinyin_data(true_data, save_data_synwords_and_samepinyin, cilinpath, same_pinyin_file,
                                           chinese_word_freq_file)   # data augmentation
    new_data = data_eva + data   # dataset including the augmented samples
    final_data,pattern_data = get_data_pattern(new_data)
    all_data, train, dev, test = data_inverse(final_data)

    all_data_path_txt = save_data_dir + "train_set.txt"
    train_path_txt = save_data_dir + "train.txt"
    test_path_txt = save_data_dir + "test.txt"
    dev_path_txt = save_data_dir + "dev.txt"
    save_data(all_data,all_data_path_txt)
    save_data(train,train_path_txt)
    save_data(test,test_path_txt)
    save_data(dev,dev_path_txt)

    # generate the test set
    dev_csv_path = "./dataset/dev_set.csv"
    dev_txt_path = save_data_dir + "dev_set.txt"
    dev = read_data(dev_csv_path,dev=True)
    dev_data, pattern_dev = get_data_pattern(dev,dev=True)
    save_data(dev_data, dev_txt_path, columns_num=4)
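
The save_data helper is never shown in these examples. A minimal sketch of what it plausibly does, keeping the first columns_num fields of each sample and writing one tab-separated line per sample; only the signature comes from the calls above, the tab delimiter is an assumption.

def save_data(samples, path, columns_num=3):
    # Hypothetical reimplementation: one sample per line, truncated to
    # columns_num tab-separated fields (assumed file format).
    with open(path, "w", encoding="utf-8") as f:
        for sample in samples:
            f.write("\t".join(str(field) for field in sample[:columns_num]) + "\n")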
Example #2
def compare_the_same_q_in_train_and_dev(train_path,dev_path,stopword_path,otherword_path):
    """
    计算测试集和训练集中相同问题的数目
    :param train_path:  训练集路径
    :param dev_path: 测试集路径
    :param stopword_path: 停用词路径
    :param otherword_path: 其他要删掉的词,如语气词等,及请问,咨询一下......
    :return:
    """
    stopwords = stopwordList(stopword_path)
    otherword = stopwordList(otherword_path)
    # stopwords += otherword
    train_data, train_true_data, train_false_data = read_data(train_path)
    train_data = [[sample[0],sample[1]] for sample in train_data]
    train_sample = spread_list(train_data)  # keep only the question texts, not the labels
    train_question = []
    for sample in tqdm(train_sample):
        train_question.append(seg_depart(sample, stopwords, otherword))
    dev_data = read_data(dev_path,dev=True)
    dev_sample = spread_list(dev_data)
    dev_question = []
    for sample in tqdm(dev_sample):
        dev_question.append(seg_depart(sample, stopwords, otherword))

    same_number = 0   # number of questions shared by the test and training sets
    for question in tqdm(dev_question):
        if question in train_question:
            same_number += 1
    return same_number
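
stopwordList and seg_depart are assumed helpers. A plausible sketch using jieba, a standard choice for Chinese segmentation; the joined-string return value is a guess, consistent with the exact-match test `question in train_question` above.

import jieba

def stopwordList(path):
    # Assumed format: one stopword per line.
    with open(path, "r", encoding="utf-8") as f:
        return [line.strip() for line in f if line.strip()]

def seg_depart(sentence, stopwords, otherwords):
    # Segment with jieba, then drop stopwords and filler words.
    tokens = jieba.lcut(sentence.strip())
    return "".join(t for t in tokens if t not in stopwords and t not in otherwords)

If seg_depart does return a string, converting train_question to a set would turn the membership loop above from O(n*m) into O(n).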
Example #3
def get_the_final_data_4(dev_samples=-5000):
    """
    获得最终版本的数据,数据格式为[q11,q21,q12,q22,q31,label],并从原始训练集里切5000条数据作为测试集
    数据增强加到了10000条
    q12:q1中与q2不同的词汇
    q22:q2中与q1不同的词汇
    q31:q1与q2相同的词汇
    :return:
    """
    save_data_dir = "./final_data/final_data_6/"
    if not os.path.exists(save_data_dir):
        os.makedirs(save_data_dir)   # makedirs, since the parent "./final_data/" may not exist yet
    cilinpath = "./cilin.txt"
    file_path_json = "./dataset/train_set.json"
    same_pinyin_file = "./same_pinyin.txt"
    chinese_word_freq_file = "./chinese-words.txt"
    save_data_synwords_and_samepinyin = save_data_dir + "data_replace_by_synwords_and_samepinyin.txt"
    data, true_data, false_data = read_data(file_path_json)
    data_eva = synword_and_samepinyin_data(true_data, save_data_synwords_and_samepinyin, cilinpath, same_pinyin_file,
                                           chinese_word_freq_file, portition=0.1)   # data augmentation
    new_data = data_eva + data   # dataset including the augmented samples
    print(len(new_data))
    all_train_data,all_train_data_pattern = get_data_pattern(new_data,mode=2)
    all_train_data,all_train_train,all_train_dev,all_train_test = data_inverse(all_train_data,mode=2)
    print(len(all_train_data))
    dev_data_from_train = new_data[dev_samples:]   # slice the last 5000 samples off the original data as a validation set
    new_data = new_data[0:dev_samples]    # the remaining samples are used for training
    final_data,pattern_data = get_data_pattern(new_data,mode=2)
    all_data, train, dev, test = data_inverse(final_data,mode=2)
    dev_data_from_train_1,dev_data_from_train_pattern = get_data_pattern(dev_data_from_train,mode=2)

    all_train_data_path = save_data_dir + "all_train_data.txt"
    all_data_path_txt = save_data_dir + "train_set.txt"
    train_path_txt = save_data_dir + "train.txt"
    test_path_txt = save_data_dir + "test.txt"
    dev_path_txt = save_data_dir + "dev.txt"
    dev_from_train_path_txt = save_data_dir + "dev_split.txt"
    save_data(all_train_data,all_train_data_path,columns_num=6)
    save_data(all_data,all_data_path_txt,columns_num=6)
    save_data(train,train_path_txt,columns_num=6)
    save_data(test,test_path_txt,columns_num=6)
    save_data(dev,dev_path_txt,columns_num=6)
    save_data(dev_data_from_train_1,dev_from_train_path_txt,columns_num=6)

    # generate the test set
    dev_csv_path = "./dataset/dev_set.csv"
    dev_txt_path = save_data_dir + "dev_set.txt"
    dev = read_data(dev_csv_path,dev=True)
    dev_data, pattern_dev = get_data_pattern(dev,dev=True,mode=2)
    save_data(dev_data, dev_txt_path, columns_num=5)
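
get_data_pattern is not shown anywhere in these examples. Based on this function's docstring, the mode=2 fields can be illustrated with a word-level diff such as the following hypothetical sketch (jieba segmentation is an assumption, not the real implementation).

import jieba

def split_shared_and_distinct(q1, q2):
    # Hypothetical illustration of the [q11, q21, q12, q22, q31] fields
    # described in the docstring above; not the real get_data_pattern.
    words1, words2 = jieba.lcut(q1), jieba.lcut(q2)
    q12 = "".join(w for w in words1 if w not in words2)  # words only in q1
    q22 = "".join(w for w in words2 if w not in words1)  # words only in q2
    q31 = "".join(w for w in words1 if w in words2)      # words shared by q1 and q2
    return q1, q2, q12, q22, q31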
Example #4
def get_same_sample_in_train_and_test():
    """计算测试集和训练集中相同样本的数目"""
    train_path = "./train_set.json"
    dev_path = "./test_set.csv"
    stopword_path = "./stopwords.txt"
    otherword_path = "./otherwords.txt"
    save_data_dir = "./tongji/"
    if not os.path.exists(save_data_dir):
        os.mkdir(save_data_dir)
    save_same_sample_path = "./tongji/same_sample_in_train_and_test.csv"
    stopwords = stopwordList(stopword_path)
    otherword = stopwordList(otherword_path)
    # stopwords += otherword
    train_data, train_true_data, train_false_data = read_data(train_path)
    dev_data = read_data(dev_path, dev=True)
    same_sample = compare_same_sample_in_train_and_test(train_data, dev_data, save_same_sample_path, stopwords,otherword)
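
compare_same_sample_in_train_and_test is assumed as well. A minimal sketch, reusing the seg_depart sketched after Example #2, that normalizes each question pair and writes the matches to CSV; the signature comes from the call above, everything else is an assumption.

import csv

def compare_same_sample_in_train_and_test(train_data, dev_data, save_path,
                                          stopwords, otherwords):
    # Normalize both questions of every pair, then collect dev pairs that
    # also occur in the training set (assumed behavior).
    def norm(sample):
        return (seg_depart(sample[0], stopwords, otherwords),
                seg_depart(sample[1], stopwords, otherwords))
    train_pairs = {norm(s) for s in train_data}
    same = [s for s in dev_data if norm(s) in train_pairs]
    with open(save_path, "w", encoding="utf-8", newline="") as f:
        csv.writer(f).writerows(same)   # CSV layout is an assumption
    return same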
Example #5
def get_the_final_data_5(dev_samples=-5000):
    """
    获得最终版本的数据,数据格式为[q1,q2,label],并从原始训练集里切5000条数据作为测试集
    注意:只包含增强后的数据,不包含原始数据
    :return:
    """
    save_data_dir = "./final_data/final_data_9/"
    if not os.path.exists(save_data_dir):
        os.makedirs(save_data_dir)   # makedirs, since the parent "./final_data/" may not exist yet
    cilinpath = "./cilin.txt"
    file_path_json = "./dataset/train_set.json"
    same_pinyin_file = "./same_pinyin.txt"
    chinese_word_freq_file = "./chinese-words.txt"
    save_data_synwords_and_samepinyin = save_data_dir + "data_replace_by_synwords_and_samepinyin.txt"
    data, true_data, false_data = read_data(file_path_json)
    data_eva = synword_and_samepinyin_data(data, save_data_synwords_and_samepinyin, cilinpath, same_pinyin_file,
                                           chinese_word_freq_file, portition=1)   # data augmentation
    # new_data = data_eva + data   # would also include the original data
    new_data = data_eva
    all_train_data, all_train_data_1, all_train_data_2, all_train_data_3 = data_inverse(new_data, pattern=False)
    dev_data_from_train = new_data[dev_samples:]  # slice the last 5000 samples off the original data as a validation set
    new_data = new_data[0:dev_samples]  # the remaining samples are used for training
    all_data, train, dev, test = data_inverse(new_data,pattern=False)
    # dev_data_from_train_1, dev_data_from_train_pattern = get_data_pattern(dev_data_from_train)

    all_train_data_path = save_data_dir + "all_train_data.txt"
    all_data_path_txt = save_data_dir + "train_set.txt"
    train_path_txt = save_data_dir + "train.txt"
    test_path_txt = save_data_dir + "test.txt"
    dev_path_txt = save_data_dir +"dev.txt"
    dev_from_train_path_txt = save_data_dir + "dev_split.txt"
    save_data(all_train_data,all_train_data_path,columns_num=3)
    save_data(all_data,all_data_path_txt,columns_num=3)
    save_data(train,train_path_txt,columns_num=3)
    save_data(test,test_path_txt,columns_num=3)
    save_data(dev,dev_path_txt,columns_num=3)
    save_data(dev_data_from_train, dev_from_train_path_txt,columns_num=3)

    # generate the test set
    dev_csv_path = "./dataset/test_set.csv"
    dev_txt_path = save_data_dir +"test_set.txt"
    dev = read_data(dev_csv_path,dev=True)
    save_data_synwords_and_samepinyin_for_dev = save_data_dir + "data_replace_by_synwords_and_samepinyin_for_dev.txt"
    data_eva = synword_and_samepinyin_data(dev, save_data_synwords_and_samepinyin_for_dev, cilinpath, same_pinyin_file,
                                           chinese_word_freq_file, columns_num=2, portition=1)  # data augmentation
    # dev_data, pattern_dev = get_data_pattern(dev,dev=True)
    save_data(dev, dev_txt_path, columns_num=2)
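
Several of these functions rely on Python's negative slicing via dev_samples=-5000; a tiny self-contained illustration of the split:

# Illustration of the dev_samples split: with a negative index, the last
# |dev_samples| items become the held-out set and the rest stay for training.
data = list(range(12))            # stand-in for new_data
dev_samples = -5                  # -5000 in the functions above
held_out = data[dev_samples:]     # last 5 items: [7, 8, 9, 10, 11]
remaining = data[0:dev_samples]   # first 7 items: [0, 1, ..., 6]
assert held_out == [7, 8, 9, 10, 11] and remaining == list(range(7))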
Example #6
def get_the_final_data_3(dev_samples=-5000):
    """
    获得最终版本的数据,数据格式为[q11,q21,q12,q22,label],并从原始训练集里切5000条数据作为测试集,pattern为使用通配符替换相同词汇
    :return:
    """
    save_data_dir = "./final_data/final_data_10/"
    if not os.path.exists(save_data_dir):
        os.makedirs(save_data_dir)   # makedirs, since the parent "./final_data/" may not exist yet
    cilinpath = "./cilin.txt"
    file_path_json = "./dataset/train_set.json"
    same_pinyin_file = "./same_pinyin.txt"
    chinese_word_freq_file = "./chinese-words.txt"
    save_data_synwords_and_samepinyin = save_data_dir + "data_replace_by_synwords_and_samepinyin.txt"
    data, true_data, false_data = read_data(file_path_json)
    data_eva_true = synword_and_samepinyin_data(true_data, save_data_synwords_and_samepinyin, cilinpath, same_pinyin_file,
                                                chinese_word_freq_file, portition=0.2)   # augment the positive samples
    data_eva_false = synword_and_samepinyin_data(false_data, save_data_synwords_and_samepinyin, cilinpath,
                                                 same_pinyin_file,
                                                 chinese_word_freq_file, portition=0.3)  # augment the negative samples
    new_data = data_eva_true + data_eva_false + data   # dataset including the augmented samples
    all_train_data,all_train_data_pattern = get_data_pattern(new_data)
    all_train_data,all_train_train,all_train_dev,all_train_test = data_inverse(all_train_data)
    dev_data_from_train = new_data[dev_samples:]   # slice the last 5000 samples off the original data as a validation set
    new_data = new_data[0:dev_samples]    # the remaining samples are used for training
    final_data,pattern_data = get_data_pattern(new_data)
    all_data, train, dev, test = data_inverse(final_data)
    dev_data_from_train_1,dev_data_from_train_pattern = get_data_pattern(dev_data_from_train)

    all_train_data_path = save_data_dir + "all_train_data.txt"
    all_data_path_txt = save_data_dir + "train_set.txt"
    train_path_txt = save_data_dir + "train.txt"
    test_path_txt = save_data_dir + "test.txt"
    dev_path_txt = save_data_dir + "dev.txt"
    dev_from_train_path_txt = save_data_dir + "dev_split.txt"
    save_data(all_train_data, all_train_data_path)
    save_data(all_data,all_data_path_txt)
    save_data(train,train_path_txt)
    save_data(test,test_path_txt)
    save_data(dev,dev_path_txt)
    save_data(dev_data_from_train_1,dev_from_train_path_txt)

    # generate the test set
    dev_csv_path = "./dataset/test_set.csv"
    dev_txt_path = save_data_dir + "test_set.txt"
    dev = read_data(dev_csv_path,dev=True)
    dev_data, pattern_dev = get_data_pattern(dev,dev=True)
    save_data(dev_data, dev_txt_path, columns_num=4)
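
data_inverse is also assumed. Judging by its name and its four return values, a plausible guess is that it doubles the data by swapping q1 and q2 (the label is symmetric under the swap) and then splits it; the swap, the shuffle, and the 8:1:1 ratio in this sketch are all assumptions.

import random

def data_inverse(samples, seed=42):
    # Guess at the assumed helper: add the swapped pair for every sample,
    # shuffle, then split 8:1:1 into train/dev/test.
    doubled = samples + [[s[1], s[0]] + list(s[2:]) for s in samples]
    random.Random(seed).shuffle(doubled)
    n = len(doubled)
    train = doubled[:int(0.8 * n)]
    dev = doubled[int(0.8 * n):int(0.9 * n)]
    test = doubled[int(0.9 * n):]
    return doubled, train, dev, test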
Example #7
def remove_stopwords_sample():
    """
    去掉停用词和礼貌用语,如,请问,谢谢了,之后的数据集,(train,test)
    :return: 输出去掉停用词之后得训练集和测试集
    """
    train_path = "./train_set.json"
    dev_path = "./test_set.csv"
    stopword_path = "./stopwords.txt"
    otherword_path = "./otherwords.txt"
    save_data_dir = "./tongji/"
    if not os.path.exists(save_data_dir):
        os.mkdir(save_data_dir)
    remove_stopwords_train_path = save_data_dir + "remove_stopwords_train_set.txt"
    remove_stopwords_test_path = save_data_dir + "remove_stopwords_test_set.txt"
    stopwords = stopwordList(stopword_path)
    otherword = stopwordList(otherword_path)
    # stopwords += otherword

    train_data, train_true_data, train_false_data = read_data(train_path)
    dev_data = read_data(dev_path, dev=True)

    remove_stopwords_train = []
    for sample in tqdm(train_data):
        q1 = seg_depart(sample[0],stopwords,otherword)
        q2 = seg_depart(sample[1],stopwords,otherword)
        remove_stopwords_train.append([q1,q2,sample[2]])

    remove_stopwords_test = []
    for sample in tqdm(dev_data):
        q1 = seg_depart(sample[0], stopwords,otherword)
        q2 = seg_depart(sample[1], stopwords,otherword)
        remove_stopwords_test.append([q1, q2])
    # only write the train file on a fresh run; the test file is always rewritten
    if not os.path.exists(remove_stopwords_train_path) and not os.path.exists(remove_stopwords_test_path):
        save_data(remove_stopwords_train, remove_stopwords_train_path, columns_num=3)
    save_data(remove_stopwords_test, remove_stopwords_test_path, columns_num=2)
    return remove_stopwords_train, remove_stopwords_test
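
A usage sketch, assuming ./train_set.json, ./test_set.csv, and the two word lists exist in the working directory, with the paths and file formats hard-coded in the function above:

train_clean, test_clean = remove_stopwords_sample()
print(len(train_clean), len(test_clean))
print(train_clean[0])   # e.g. [q1_without_stopwords, q2_without_stopwords, label]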
Example #8
def the_average_length_of_question_in_train_dataset():
    """
    统计训练集中,每个问题的平均长度
    :return:
    """
    train_path = "./train_set.json"
    train_data, train_true_data, train_false_data = read_data(train_path)
    all_length = 0
    all_question = 0
    for sample in train_data:
        all_length += len(sample[0])
        all_length += len(sample[1])
        all_question += 2
    average_length_question = all_length/all_question
    print("the_average_length_of_question_in_train_dataset is {}".format(average_length_question))
    return average_length_question
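
Since len() on a Python 3 str counts characters, this is a character-level average, which is the natural unit for unsegmented Chinese text. For reference, the same statistic in two lines:

# Equivalent compact computation: average character length over both
# questions of every training pair (train_data as loaded above).
lengths = [len(q) for sample in train_data for q in (sample[0], sample[1])]
average_length_question = sum(lengths) / len(lengths)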