# Shared imports for the snippets below. read_file, write_data, write_datas,
# save_file, save_file_lines, save_model, load_model, get_model, deal_datas,
# read_Ch_cate, get_merge_dataset, get_train_vec, load_binary_model,
# load_merge_model, get_test_dataset and get_dataset are project helpers
# assumed to be available in this module, as is the NUMBER constant (the
# count of binary sub-models per class).
import os
import time
import json
import random
import logging

import numpy as np
from gensim.models import Word2Vec
from sklearn import metrics
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer


def load_data(binary_data_path):
    start_time = time.time()
    # the file is expected to hold a Python dict literal
    binary_data_dic = eval(read_file(binary_data_path))
    end_time = time.time()
    print('Data loading time: {}s'.format(end_time - start_time))

    return binary_data_dic
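
# A safer drop-in for the eval() call above, assuming the file really stores a
# plain Python dict literal: ast.literal_eval parses literals only and refuses
# to execute arbitrary code. A minimal sketch, not part of the original project.
import ast

def load_data_literal(binary_data_path):
    with open(binary_data_path, encoding='utf-8') as f:
        return ast.literal_eval(f.read())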
def con_to_levn(con, ori_datas_path, le_file_path, save_path, le_n, title_n):
    '''
    param ori_datas_path: txt file indexed by id, title, keywords, abstract,
                          CLC (Chinese Library Classification) code and source journal
    param le_file_path: hierarchy catalogue file
    param save_path: directory where the labelled dataset is stored
    param le_n: target hierarchy level
    param title_n: number of times the extracted title is repeated
    output: labelled class folders, one per level-n category, each holding
            multiple text documents.
    '''
    train_path = save_path + 'train/'
    test_path = save_path + 'test/'
    if not os.path.exists(train_path):
        os.makedirs(train_path)
    if not os.path.exists(test_path):
        os.makedirs(test_path)

    start2 = time.time()
    # read the data
    # con = read_datas(ori_datas_path)
    levs = list(con['中图分类号'])

    # read the hierarchy catalogue
    read_Ch_cate(save_path, le_file_path, le_n)
    le_n_names = read_file(save_path + 'level_' + str(le_n) + '.txt',
                           'utf-8').split(',')

    # extract the level-n data (stopword_path is a module-level global here)
    select_txt(le_n, con, le_n_names, levs, train_path, test_path,
               stopword_path)
    end2 = time.time()
    run_select_time = round(end2 - start2, 3)
    print('Dataset generation time: ' + str(run_select_time) + 's')
def statistics_corpus(data_path):
    print('=' * 40)
    print('I-2. Computing corpus statistics...')
    level_dic = {}
    level1_count = 0
    all_doc_count = 0
    for level in os.listdir(data_path):
        if '.txt' in level or level.startswith('.'):
            continue
        temp_count = 0
        le1_list = os.listdir(data_path + level + '/')
        if len(le1_list):
            level_dic[level] = []
        else:
            # drop empty class directories
            os.rmdir(data_path + level + '/')
        for file in le1_list:
            if 'count' in file:
                con = read_file(data_path + level + '/' + file, 'utf-8').split('\n')
                level1_count += len(con) - 1
                level_dic[level].append(len(con) - 1)
                for i in con[:-1]:
                    # count lines follow the 'label-->count' convention
                    num = i.split('-->')[1]
                    temp_count += int(num)
                level_dic[level].append(temp_count)
                all_doc_count += temp_count
                print(str(len(con) - 1) + '/' + str(temp_count))
    write_data(data_path + 'statistics.txt',
               'level-1\tsub-category count\tdocument count\n', 'single')
    write_data(data_path + 'statistics.txt', level_dic.items())
    write_data(data_path + 'statistics.txt',
               '\ntotal sub-categories: %d' % level1_count +
               '\ntotal documents: %d' % all_doc_count, 'single')
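
# The *_count.txt files consulted above store one 'label-->count' record per
# entry; some writers append a trailing comma. A small tolerant parser for
# that convention (illustrative helper, not part of the original code):
def parse_count_record(record):
    label, num = record.split('-->')
    return label, int(num.rstrip(','))

assert parse_count_record('TP391-->120,') == ('TP391', 120)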
Esempio n. 4
0
def merge_level3(file_path, fp):
    for level in os.listdir(file_path):
        if '.txt' in level:
            continue
        for file in os.listdir(file_path + level + '/'):
            if 'count' in file:
                continue
            con = read_file(file_path + level + '/' + file, 'utf-8')
            fp.write(con)
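
# Typical use of merge_level3: concatenate every per-class text file into one
# open handle (paths here are illustrative):
# with open('all_level3_train.txt', 'w', encoding='utf-8') as fp:
#     merge_level3('data/train/', fp)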
def con_to_levn(con, ori_datas_path, le_path_list, save_path, class_info,
                title_n):
    '''
    param ori_datas_path: txt file indexed by id, title, keywords, abstract,
                          CLC code and source journal
    param le_path_list: hierarchy catalogue files, one per classification scheme
    param class_info: input hierarchy levels (scheme name -> target level)
    param save_path: directory where the labelled dataset is stored
    param title_n: number of times the extracted title is repeated
    output: labelled class folders, one per level-n category, each holding
            multiple text documents.
    '''
    train_path = save_path + 'train/'
    test_path = save_path + 'test/'
    if not os.path.exists(train_path):
        os.makedirs(train_path)
    if not os.path.exists(test_path):
        os.makedirs(test_path)

    # read the data
    # con = read_datas(ori_datas_path)
    #
    # one code list per classification scheme: region, industry, subject, CLC
    levs = []
    levs_area = list(con['区域分类'])
    levs_industry = list(con['行业分类'])
    levs_subject = [str(s) for s in con['学科分类']]
    levs_china_class = list(con['中图分类'])

    levs.append(levs_area)
    levs.append(levs_industry)
    levs.append(levs_subject)
    levs.append(levs_china_class)

    classes = list(class_info.keys())
    lens = list(class_info.values())

    for i in range(len(le_path_list)):
        # read the hierarchy catalogue for scheme i
        start2 = time.time()
        read_Ch_cate(save_path, le_path_list[i], classes[i], lens[i])
        le_n_names = read_file(
            save_path + 'level_' + classes[i] + '_' + str(lens[i]) + '.txt',
            'utf-8').split(',')
        # extract the level-n data
        select_txt(classes[i], con, le_n_names, levs[i],
                   train_path + classes[i], test_path, stopword_path, lens[i])
        end2 = time.time()
        run_select_time = round(end2 - start2, 3)
        print('Dataset generation time: ' + str(run_select_time) + 's')
def train_word2vec(train_path, model_save_path):
    # Word2Vec expects an iterable of token lists, so split each line into
    # tokens (the training file is assumed to be whitespace-tokenised text,
    # one document per line)
    data = [line.split() for line in read_file(train_path, 'utf-8').split('\n')]
    start = time.time()
    logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s',
                        level=logging.INFO)
    model = Word2Vec(data,
                     sg=0,  # sg=0 trains CBOW; sg=1 trains skip-gram
                     hs=1,
                     size=200,
                     window=5,
                     min_count=5,
                     iter=50)
    model.save(model_save_path + 'w2v_iter_20_model')  # file name kept as in the project
    end = time.time()

    print('Training time: %ss' % str(end - start))
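# Note: size= and iter= follow the gensim < 4.0 API; under gensim >= 4.0 the
# equivalent call renames them to vector_size= and epochs=, e.g.:
# model = Word2Vec(data, sg=0, hs=1, vector_size=200, window=5, min_count=5, epochs=50)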
def get_level3_name(file_path, save_path):
    level3_dic = {}

    # a with-block guarantees the output handle is flushed and closed
    with open(save_path, 'a', encoding='utf-8') as t:
        for level in os.listdir(file_path):
            if '.txt' in level:
                continue
            level3_dic[level] = []
            for file in os.listdir(file_path + level + '/'):
                if 'count' in file:
                    con = read_file(file_path + level + '/' + file,
                                    'utf-8').split('\n')
                    for i in con[:-2]:
                        level3 = i.split('-->')[0]
                        t.write(level3 + '\n')
                        print(level3)
                        level3_dic[level].append(level3)
    print(level3_dic)
def count_level1(file_path):
    level_dic = {}
    level1_count = 0
    for level in os.listdir(file_path):
        if '.txt' in level:
            continue
        temp_count = 0
        level_dic[level] = []
        for file in os.listdir(file_path + level + '/'):
            if 'count' in file:
                con = read_file(file_path + level + '/' + file,
                                'utf-8').split('\n')
                level1_count += len(con) - 2
                level_dic[level].append(len(con) - 2)

                for i in con[:-2]:
                    # lines follow 'label-->count,'; strip the trailing comma
                    num = i.split('-->')[1]
                    temp_count += int(num[:-1])
                level_dic[level].append(temp_count)
                print(str(len(con) - 2) + '/' + str(temp_count))
    print(str(level_dic))
    print(level1_count)
def div_train_test(file_path, train_path, test_path):
    if not os.path.exists(train_path):
        os.makedirs(train_path)
    if not os.path.exists(test_path):
        os.makedirs(test_path)
    lev1 = os.listdir(file_path)

    for l1 in lev1:
        if '.txt' in l1:
            continue
        if len(os.listdir(file_path + l1 + '/')) == 0:
            continue
        file = file_path + l1 + '/' + l1 + '_train.txt'
        count_file = file_path + l1 + '/' + l1 + '_train_count.txt'

        save_train_path = train_path + l1 + '/'
        save_test_path = test_path + l1 + '/'

        if not os.path.exists(save_train_path):
            os.makedirs(save_train_path)
        if not os.path.exists(save_test_path):
            os.makedirs(save_test_path)

        save_train_file = save_train_path + l1 + '_train.txt'
        save_test_file = save_test_path + l1 + '_test.txt'
        save_tr_count = save_train_path + l1 + '_train_count.txt'
        save_te_count = save_test_path + l1 + '_test_count.txt'

        train_fp = open(save_train_file, 'w', encoding='utf-8')
        train_count_fp = open(save_tr_count, 'w', encoding='utf-8')
        test_fp = open(save_test_file, 'w', encoding='utf-8')
        test_count_fp = open(save_te_count, 'w', encoding='utf-8')

        con = read_file(file, 'utf-8').split('\n')
        count_con = read_file(count_file, 'utf-8').split('\n')

        lev3_test_dic = {}
        # count level-3 class sizes to determine the split ratio
        for l3 in count_con[:-2]:
            le3_count = l3.split('-->')
            num = le3_count[1].replace(',', '')
            test_num = int(num) / 5  # 4:1 train/test split
            train_num = int(int(num) - int(test_num))
            lev3_test_dic[le3_count[0]] = int(test_num)
            train_count_fp.write(le3_count[0] + '-->' + str(train_num) + '\n')
            test_count_fp.write(le3_count[0] + '-->' + str(int(test_num)) +
                                '\n')

        # print(lev3_test_dic)
        train_count_fp.write(str(count_con[-2]))
        test_count_fp.write(str(count_con[-2]))

        temp_test_count = 0
        temp_label = list(lev3_test_dic.keys())[0]
        for act in con[:-1]:
            row = act.split(',')
            label = row[0].split('__')[-1]

            test_count = lev3_test_dic[label]
            if label == temp_label:
                # the first test_count docs of each label go to the test set;
                # '\n' is re-added because split('\n') stripped the terminators
                if temp_test_count <= test_count:
                    test_fp.write(act + '\n')
                else:
                    train_fp.write(act + '\n')
                temp_label = label
                temp_test_count += 1
            else:
                temp_test_count = 1
                train_fp.write(act + '\n')
                temp_label = label

        train_fp.close()
        train_count_fp.close()
        test_fp.close()
        test_count_fp.close()
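
# An equivalent stratified 4:1 split with scikit-learn, shown as an alternative
# to the per-label counting above (a sketch; docs and labels are parallel lists):
from sklearn.model_selection import train_test_split

def split_4_to_1(docs, labels):
    return train_test_split(docs, labels, test_size=0.2,
                            stratify=labels, random_state=42)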
def select_txt(le_n, con, le_n_names, levs, train_path, test_path,
               stopword_path):
    print('Extracting data and building the train/test datasets...')
    stopword = read_file(stopword_path, 'utf-8').split('\n')
    # number of classes
    count_train_1 = 0
    count_test_1 = 0
    # temporary buffers
    train_list = []
    test_list = []
    # level-3 class counter
    le3_count_cate = 0
    # current level-1 category marker
    temp_cate = le_n_names[0][0]

    for i in le_n_names:  # iterate the level-n catalogue names
        # per-class document counters
        count_train_3 = 0
        count_test_3 = 0
        # temporary buffers
        test_1 = []
        train_1 = []
        test_3 = []
        train_3 = []

        # level-1 class name for the training files
        le1_name = i[0]
        len_num = 1

        if '/' in i:
            i = i.replace('/', ' ')

        le_3_train_path = train_path + i[0] + '/'
        le_3_test_path = test_path + i[0] + '/'

        if not os.path.exists(le_3_train_path):
            os.makedirs(le_3_train_path)
        if not os.path.exists(le_3_test_path):
            os.makedirs(le_3_test_path)

        for m in enumerate(levs[:]):  # iterate the raw classification-code list
            m_list = []
            try:
                if ';' in m[1]:  # a record may carry several codes, separated by ';'
                    m_list = m[1].split(';')
                else:
                    m_list.append(m[1])
            except:
                continue
            for p in m_list:
                if len(p) == 0:
                    continue
                elif i[0] == p[0] and i in p:  # code matches the level-n category
                    index = m[0]  # row index of the record to extract
                    item = con.loc[index]
                    title = item['标题']  # title
                    content = item['摘要']  # abstract, used as body text
                    key_word = item['关键词']  # keywords
                    content = title + ' ' + content + ' ' + key_word
                    try:
                        if len_num % 4 == 0:  # every 4th record goes to the test set
                            test_1.append(deal_datas(i[0], content, stopword))
                            test_3.append(deal_datas(i, content, stopword))
                            count_test_3 += 1
                        else:
                            train_1.append(deal_datas(i[0], content, stopword))
                            train_3.append(deal_datas(i, content, stopword))
                            count_train_3 += 1
                    except:
                        print('Error while extracting data for category %s!' % i)
                    else:
                        len_num += 1

        if count_train_3 >= 100:
            # save the level-3 training subset
            save_file(le_3_train_path + i[0] + '_train_count.txt',
                      i + '-->' + str(count_train_3) + ',', 'a')
            random.shuffle(train_3)
            write_datas(le_3_train_path + i[0] + '_train.txt', train_3)
            train_list.append(train_1)

            if temp_cate == i[0]:
                le3_count_cate += 1
            else:
                save_file(
                    train_path + temp_cate + '/' + temp_cate +
                    '_train_count.txt',
                    'class count' + '-->' + str(le3_count_cate) + ',', 'a')
                le3_count_cate = 1
            temp_cate = i[0]

        if count_test_3 > 35:
            # save the level-3 test subset
            save_file(le_3_test_path + i[0] + '_test_count.txt',
                      i + '-->' + str(count_test_3) + ',', 'a')
            random.shuffle(test_3)
            write_datas(le_3_test_path + i[0] + '_test.txt', test_3)
            test_list.append(test_1)

    # shuffle so same-class documents are spread out
    for l1 in test_list:
        random.shuffle(l1)
        write_datas(test_path + 'level_' + le_n[0] + '_test.txt', l1)
    for l2 in train_list:
        random.shuffle(l2)
        write_datas(train_path + 'level_' + le_n[0] + '_train.txt', l2)

    save_file(train_path + temp_cate + '/' + temp_cate + '_train_count.txt',
              'class count' + '-->' + str(le3_count_cate) + ',', 'a')
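
# Classification-code cells may hold several codes joined by ';' (handled
# inline above); the same convention as a standalone helper for clarity:
def split_codes(cell):
    return [c for c in str(cell).split(';') if c]

assert split_codes('TP391;O13') == ['TP391', 'O13']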
def select_txt(le_n, con, le_n_names, levs, train_path, test_path,
               stopword_path, lens):
    print('Extracting data and building the train/test datasets...')
    lens = str(lens)  # the level may arrive as an int; file paths concatenate it
    # number of retained classes
    cate_count = 0
    # temporary buffers
    train_list = []
    test_list = []
    stopword = read_file(stopword_path, 'utf-8').split('\n')
    for i in le_n_names:  # iterate the level-n catalogue names
        # per-class document counters
        count_train = 0
        count_test = 0
        # temporary buffers
        test = []
        train = []
        # level-1 class name for the training files
        # le1_name = i[0]
        len_num = 1

        if '/' in i:
            i = i.replace('/', ' ')

        for j in enumerate(levs[:]):  # iterate the raw classification-code list

            j_list = []
            try:
                if ';' in j[1]:  # a record may carry several codes, separated by ';'
                    j_list = j[1].split(';')
                else:
                    j_list.append(j[1])
            except:
                continue
            for p in j_list:
                p = str(p)
                if len(p) == 0:
                    continue
                # elif i[0] == p[0] and i in p:  # old level-n match
                elif len(i) >= 3 and len(p) >= 3 and i[:3] == p[:3]:  # match on the first 3 code chars
                    index = j[0]  # row index of the record to extract
                    item = con.loc[index]
                    # title = item['标题']
                    content = item['ContentText']  # body text
                    # key_word = item['关键词']
                    # content = title + ' ' + content + ' ' + key_word
                    try:
                        # if len_num % 8 == 0:
                        #     test.append(deal_datas(i, content, stopword))
                        #     count_test += 1
                        # else:
                        train.append(deal_datas(i, content, stopword))
                        count_train += 1
                    except:
                        print('Error while extracting data for category %s!' % i)
                    else:
                        len_num += 1

        if count_train >= 20:
            save_file(train_path + 'train_' + lens + '_count.txt',
                      i + '-->' + str(count_train) + ',', 'a')
            train_list.append(train)
            cate_count += 1
        # if count_test >= 0:
        #     save_file(test_path + 'test_' + lens + '_count.txt',
        #               i + '-->' + str(count_test) + ',', 'a')
        #     test_list.append(test)

    # shuffle so same-class samples are spread out
    for l1 in test_list:
        random.shuffle(l1)
        write_datas(test_path + 'level_' + lens + '_test.txt', l1)
    for l2 in train_list:
        random.shuffle(l2)
        write_datas(train_path + 'level_' + lens + '_train.txt', l2)

    save_file(train_path + 'train_' + lens + '_count.txt',
              'class count: ' + str(cate_count) + ',', 'a')
def train_merge_classifier(algorithm, train_merge_path, w2v_model,
                           model_save_path, result_save_path):
    # generate merge sub-dataset
    print('..building the merge training set')
    train_start = time.time()
    train_merge_data = eval(read_file(train_merge_path))
    cate_list = list(train_merge_data.keys())
    class_number = len(cate_list)
    # build the merge training set
    train_merge_dic = get_merge_dataset(class_number, cate_list,
                                        train_merge_data)
    # result save path
    merge_result_save_path = result_save_path + 'merge_result/'
    if not os.path.exists(merge_result_save_path):
        os.makedirs(merge_result_save_path)

    # merge model save path
    merge_model_path = model_save_path + 'merge_model/'
    if not os.path.exists(merge_model_path):
        os.makedirs(merge_model_path)

    # train merge classifier
    # load the binary-classifier name map
    # model_name_map = eval(read_file(result_save_path+algorithm+'_model_name_map.txt'))
    # model_name_map = json.loads(read_file(result_save_path+algorithm+'_model_name_map_json.txt'))   # faster than the line above

    # get sklearn classifier model
    clf = get_model(algorithm)
    # load the w2v model
    # w2v_model = load_w2v_model(w2v_model_path)
    # load binary models
    model_dic = load_binary_model(algorithm, class_number, cate_list,
                                  model_save_path)

    print('..training')
    sum_score = 0.0
    all_less_str, all_result_str = '', ''
    for k in range(class_number):
        start_time = time.time()
        cur_cate = cate_list[k]
        item = train_merge_dic[cur_cate][0]
        # train_merge_dic[cur_cate][0] looks like [['text1','text2'], ['R','-R']]
        con, labels = item[0], item[1]
        merge_size = len(labels)

        lb_list = []
        text_pro = []
        for label in labels:
            if label == cur_cate:
                lb_list.append(1)
            else:
                lb_list.append(0)
        for model in model_dic[cur_cate]:
            # clf = load_model(model)
            binary_model = model[1]
            # voc = model_name_map[model[0]]
            train_w2v = get_train_vec(con, w2v_model)
            # vectorizer = TfidfVectorizer(vocabulary=voc)
            # tdm = vectorizer.fit_transform(con)
            pred = binary_model.predict_proba(train_w2v)  # pred = clf.predict(tdm)
            for i in range(len(pred)):
                text_pro.append(pred[i][1])
        # NUMBER is the module-level count of binary sub-models per class
        pro_matrix = np.array(text_pro).reshape((NUMBER, merge_size)).T
        '''
        lb_list = []
        all_text_pro = []
        for text,label in zip(con,labels):
            text_pro = []
            if label == cur_cate:
                lb_list.append(1)
            else:
                lb_list.append(0)
            for model in model_dic[cur_cate]:
                print(model)
                print(text)
                clf = load_model(model)
                voc = model_name_map[model]
                vectorizer = TfidfVectorizer(vocabulary=voc)
                tdm = vectorizer.fit_transform([text])
                pred = clf.predict_proba(tdm)   # pred = clf.predict(tdm)
                print(pred)
                text_pro.append(pred[0][1])
                print(pred[0][1])
                break
            all_text_pro.append(text_pro)
            # print(text_pro)
            break

        # print(len(all_text_pro))
        # print(all_text_pro[:5])
        '''
        # training: fit on the training split only; fitting on the full matrix
        # would leak x_test into the evaluation below
        x_train, x_test, y_train, y_test = train_test_split(pro_matrix,
                                                            lb_list,
                                                            test_size=0.3)
        classifier = clf.fit(x_train, y_train)
        pred = clf.predict(x_test)
        score = metrics.accuracy_score(y_test, pred)
        if score <= 0.85:
            all_less_str += cur_cate + ':' + str(score) + '\n'

        sum_score += score
        result_str = '%s merge classifier accuracy : %f\n' % (cur_cate,
                                                              round(score, 3))
        all_result_str += result_str
        print(result_str)
        save_file_lines(merge_result_save_path + algorithm + '_test.txt',
                        result_str, 'a')
        model_full_path = merge_model_path + cur_cate + '_' + algorithm + '_merge.model'
        save_model(classifier, model_full_path)

        end_time = time.time()
        print('one run time {}\n'.format(end_time - start_time))

    avg_score = sum_score / class_number
    avg_score_str = algorithm + ' merge classifier avg accuracy %f' % avg_score
    print(avg_score_str)

    train_end = time.time()
    train_run_time = round(train_end - train_start, 4)
    merge_run_time_str = 'merge train time: %f' % (train_run_time)
    print(merge_run_time_str)

    save_file_lines(merge_result_save_path + algorithm + '_score_less0.85.txt',
                    all_less_str, 'w')
    # save_file_lines(merge_result_save_path+algorithm+'_test.txt',all_result_str,'w')
    save_file_lines(merge_result_save_path + algorithm + '_test.txt',
                    avg_score_str + '\n' + merge_run_time_str, 'a')
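
# The merge step above is a form of stacking: each of the NUMBER per-class
# binary models emits P(positive) for every document, and those probabilities
# become the feature vector of a meta-classifier. A minimal standalone sketch
# of that matrix construction (names are illustrative):
import numpy as np

def stack_probabilities(base_models, X):
    # column k holds base_models[k]'s positive-class probability per sample
    return np.column_stack([m.predict_proba(X)[:, 1] for m in base_models])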
def train_binary_classifier(algorithm, train_path, model_save_path,
                            result_save_path):
    # get the category list and load the data
    train_dic = json.loads(read_file(train_path))
    cate_list = list(train_dic.keys())
    class_number = len(cate_list)
    train_start = time.time()

    result_save_full_path = result_save_path + 'binary_result/'
    if not os.path.exists(result_save_full_path):
        os.makedirs(result_save_full_path)

    # get the classifier template
    clf = get_model(algorithm)

    print('..training binary classifiers')
    all_score = 0.0
    results, less_score_cate = '', ''
    model_name_map = {}  # map from model path to its training vocabulary
    for cate in cate_list:
        sum_score = 0.0
        if not os.path.exists(model_save_path + cate + '/'):
            os.makedirs(model_save_path + cate + '/')
        for i in range(NUMBER):  # NUMBER sub-datasets (and sub-models) per class
            train, label = train_dic[cate][i][0], train_dic[cate][i][1]
            x_train, x_test, y_train, y_test = train_test_split(train,
                                                                label,
                                                                test_size=0.2)

            classifier = clf.fit(x_train, y_train)
            pred = clf.predict(x_test)
            score = metrics.accuracy_score(y_test, pred)
            sum_score += score
            # save_model
            model_full_path = model_save_path + cate + '/' + algorithm + '_' + cate + '_' + str(
                i + 1) + '.model'
            save_model(classifier, model_full_path)
            # model_name_map[model_full_path] = train_dic[cate][i][2]
        avg_score = sum_score / NUMBER
        all_score += sum_score
        if avg_score <= 0.85:
            save_file_lines(
                result_save_full_path + algorithm + '_score_less0.85.txt',
                str(cate) + '\n', 'a')
        # report the per-class average, not the last fold's score
        result_str = "%s avg-accuracy:   %0.3f " % (cate, avg_score)
        print(result_str)
        results = result_str + '\n'
        save_file_lines(result_save_full_path + algorithm + '_test.txt',
                        results, 'a')
        # break
    all_avg = all_score / (NUMBER * class_number)
    all_result_str = "%s all avg-accuracy:   %0.3f \n" % (algorithm, all_avg)
    print(all_result_str)

    train_end = time.time()
    train_run_time = round(train_end - train_start, 4)
    run_time_str = algorithm + ' train time: %f' % (train_run_time)
    print(run_time_str)

    save_file_lines(result_save_full_path + algorithm + '_test.txt',
                    all_result_str + '\n' + run_time_str, 'a')
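
# Hypothetical top-level wiring of the two training stages above (paths and
# the 'svm' algorithm key are illustrative, not taken from the project):
# train_binary_classifier('svm', 'data/binary_train.json', 'models/', 'results/')
# train_merge_classifier('svm', 'data/merge_train.txt', w2v_model, 'models/', 'results/')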
def test(algorithm, test_final_path, result_save_path, model_save_path,
         w2v_model, skip_word_save_path, fasttext_train):
    # read the test-set file
    test_final_data = eval(read_file(test_final_path))
    cate_list = list(test_final_data.keys())
    class_number = len(cate_list)
    test_start = time.time()

    # load the binary-classifier name map
    # model_name_map = eval(read_file(result_save_path+algorithm+'_model_name_map_json.txt'))
    # model_name_map = json.loads(read_file(result_save_path+algorithm+'_model_name_map_json.txt'))   # faster than the line above

    all_len,all_right = 0,0
    result_path = result_save_path+'final_test'+'/'
    if not os.path.exists(result_path):
        os.makedirs(result_path)
    record_path = result_path+'records_way_le1_le2_w2v/'
    if not os.path.exists(record_path):
        os.makedirs(record_path)

    test_result_path = result_path+algorithm+'_test_way_le1_le2_w2v.txt'
    
    # build the test texts and labels
    test_data_dic = get_test_dataset(class_number, cate_list, test_final_data)

    # test phase
    # load models
    load_model_time = time.time()
    # load the fastText models used for level-1 / level-2 pre-filtering
    le1_model = 'level_1/level_1_fasttext_classifier_big_big.model'
    le1_fasttext_model = fasttext_train.load_fasttext(le1_model)
    le2_model = 'level_2/level_2_fasttext_classifier_big_big.model'
    le2_fasttext_model = fasttext_train.load_fasttext(le2_model)


    # get sklearn classifier model
    # clf = get_model(algorithm)
    # load binary models
    print('..loading binary classifier models')
    model_dic = load_binary_model(algorithm, class_number, cate_list,
                                  model_save_path)
    load_end_time = time.time()
    print('Model loading time: {}'.format(load_end_time - load_model_time))
    # load merge models
    model_merge_dic = load_merge_model(algorithm, model_save_path + 'merge_model/')
    # load the w2v model
    # w2v_model = load_w2v_model(w2v_model_path)

    # read the knowledge base (KB)
    # kb_dic = get_KB_dic(skip_word_save_path)

    for cate,cons in test_data_dic.items():
        test_one_time = time.time()
        cur_cate = cate
        print('cur cate %s'%cur_cate)

        # correct-prediction counter (TP/FP/TN/FN bookkeeping was planned here)
        # tp_num = 0
        right = 0
        texts,labels = cons[0],cons[1]
        
        test_size = len(labels)
        all_len += test_size
        
        # level-1 / level-2 predictions from the fastText pre-filters
        level_1_pre_result = fasttext_train.test_model(texts, le1_fasttext_model)
        level_2_pre_result = fasttext_train.test_model(texts, le2_fasttext_model)
        level_1_pre_labels_list,level_2_pre_labels_list = [],[]
        for le1 in level_1_pre_result:
            # fastText labels look like '__label__X'; index 2 is the class name
            label_list_le1 = le1[0][:-1].split('__')
            level_1_pre_labels_list.append(label_list_le1[2])

        for le2 in level_2_pre_result:
            label_list_le2 = le2[0][:-1].split('__')
            level_2_pre_labels_list.append(label_list_le2[2])

        # print(level_2_pre_labels_list)
        # knowledge-base prediction
        # text_list = [text.split() for text in texts]
        # kb_labels_list = get_level_3_from_KB(kb_dic, text_list)

        # record per-text prediction results
        text_pre_results = {}
        for i in range(test_size):
            text,label = texts[i],labels[i]
            text_pre_results[label+'\t'+text] = []

            # text_kb_label = []
            # text_kb_label = kb_labels_list[i]
            text_le2_label = []
            text_le1_label = level_1_pre_labels_list[i]
            text_le2_label = level_2_pre_labels_list[i]
            le_flag = 0
            if text_le2_label[0] == text_le1_label:
                le_flag = True
            # if label not in text_kb_label:  # skip when the KB prediction misses the true label (way_5 did not skip)
            #     continue
            pre_result_dic = {}
            for bin_cate,models in model_dic.items():
                skip_flag = 0
                # if bin_cate not in text_kb_label:
                #     continue
                if not le_flag and bin_cate[0] in text_le1_label:
                    skip_flag = 1
                if le_flag and bin_cate[:2] in text_le2_label:
                    skip_flag = 2
                # if bin_cate[0] in text_le1_label:
                #     skip_flag = 3
                if skip_flag: 
                    text_pro = []
                    pre_result_dic[bin_cate] = []
                    for model in models:
                        # clf = load_model(model)
                        clf = model[1]
                        train_w2v = get_train_vec([text],w2v_model,skip_word_save_path)
                        # voc = model_name_map[model[0]]
                        # vectorizer = TfidfVectorizer(vocabulary=voc)
                        # tdm = vectorizer.fit_transform([text])
                        pred = clf.predict_proba(train_w2v)
                        text_pro.append(pred[0][1])
                        
                    for c,md in model_merge_dic.items():
                        if bin_cate in md:
                            # print('load merge model %s'%mer_md)
                            merge_model = load_model(md)
                            pre = merge_model.predict_proba([text_pro])
                            pre_result_dic[bin_cate].append(pre[0][1])
                    
                
            sort_pre_tuple = sorted(pre_result_dic.items(), key=lambda d:d[1],reverse=True)
            sort_cate_len = len(sort_pre_tuple)
            pre_cate_list = []
            for pre_cate in sort_pre_tuple:
                pre_cate_list.append(pre_cate[0])
            pre_len = len(pre_cate_list)
            if pre_len >=3:
                if label in pre_cate_list[:3]:
                    right += 1
                    all_right += 1
            elif label in pre_cate_list:
                right += 1
                all_right += 1

            text_pre_results[label+'\t'+text].append([text_le1_label,text_le2_label,pre_cate_list[:10]])
            
        acc = right / test_size
        print('acc %f' % acc)  # way_1 (no fastText) reached 0.21 in 6h; adding fastText raises it to 0.23 (way_3, way_4)
        save_file_lines(record_path + cur_cate + '_svm_result_record_way_le1_le2_w2v.txt',
                        text_pre_results, 'w')
        '''
        right = 0
        # doc_dic = {}
        doc_dic = {i:[] for i in range(test_size)}
        for bin_cate,models in model_dic.items():
            if bin_cate[0] not in level_2_pre_labels_list:
                continue
            text_pro = []
            pro_matrix = np.array([],[])
            # print('load binary model %s'%bin_cate)
            for model in models:
                # clf = load_model(model)
                clf = model[1]
                voc = model_name_map[model[0]]
                vectorizer = TfidfVectorizer(vocabulary=voc)
                tdm = vectorizer.fit_transform(texts)
                pred = clf.predict_proba(tdm)
                for i in range(len(pred)):
                    text_pro.append(pred[i][1])
            pro_matrix = np.array(text_pro).reshape((NUMBER,test_size)).T
            temp_pro = []
            for c,md in model_merge_dic.items():
                if bin_cate in md:
                    # print('load merge model %s'%mer_md)
                    merge_model = load_model(md)
                    pre = merge_model.predict_proba(pro_matrix)
                    for j in range(len(pre)):
                        doc_dic[j].append([pre[j][1],c])
        # print(doc_dic)
        # without KB or hierarchical filtering
        
        for doc,pro_list in doc_dic.items():
            pro_sort = sorted(pro_list,key=lambda d:d[0], reverse = True)
            pre_cate = [pro_sort[0][1],pro_sort[1][1],pro_sort[2][1]]       # take the top-3 predicted classes; accuracy 0.40+
            if cur_cate in pre_cate:      # old method: pro_sort[0][1] == cur_cate
                right += 1
                all_right += 1
        
        # sort the results and intersect with the KB prediction
        
        for doc,pro_list in doc_dic.items():
            sort_pre_tuple = sorted(pro_list,key=lambda d:d[0], reverse = True)
            sort_cate_len = len(sort_pre_tuple)
            pre_cate_list = []
            for pre_cate in sort_pre_tuple:
                pre_cate_list.append(pre_cate[1])
            if len(pre_cate_list) >=3:
                if label in pre_cate_list[:3]:
                    right += 1
                    all_right += 1
            elif label in pre_cate_list:
                right += 1
                all_right += 1
            # intersect with the KB prediction
            
            kb_pre_cate = []
            for cate in pre_cate_list:
                if cate in pro_list[0]:
                    kb_pre_cate.append(cate)
            kb_pre_len = len(kb_pre_cate)
            if kb_pre_len >=3:
                if cur_cate in kb_pre_cate[:3]:
                    right += 1
                    all_right += 1
            elif cur_cate in kb_pre_cate:
                right += 1
                all_right += 1
            
        acc = right/test_size
        print('acc %f'%acc)
        '''
        
        # if acc <= 0.4:
        #     save_file_lines(result_path+algorithm+'_less_0.2_way_0.txt',cur_cate+': '+str(acc)+'\n','a')
        test_one_end_time = time.time()
        one_run_time = round(test_one_end_time-test_one_time,4)
        print('test one cate time:%f\n'%one_run_time)

        save_file_lines(test_result_path,cur_cate+' dataset accuracy :%f'%acc+'\n','a')
        
    print('micro acc %f' % (all_right / all_len))
    test_end = time.time()
    test_run_time = round(test_end-test_start,4)
    print('merge test time: %f'%(test_run_time))

    save_file_lines(test_result_path,'using '+algorithm+' micro acc %f'%(all_right/all_len)+'\n','a')
    save_file_lines(test_result_path,'merge test time: %f'%(test_run_time),'a')
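
# Top-k evaluation as applied above, factored into a standalone helper for
# clarity (a sketch, not part of the original code):
def hit_top_k(label, ranked_cates, k=3):
    # ranked_cates is sorted by descending merge-classifier probability
    return label in ranked_cates[:k]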
    
def test(algorithm, test_final_data, result_save_path, model_save_path):
    cate_list = list(test_final_data.keys())
    class_number = len(cate_list)
    test_start = time.time()
    # load model
    model_dic = {}
    model_merge_dic = {}
    for j in range(class_number):
        cur_cate = cate_list[j]
        model_dic[cur_cate] = []
        model_path = model_save_path + cur_cate + '/'
        models = os.listdir(model_path)
        for model in models:
            if algorithm in model:  # svm
                model_full_path = model_path + model
                model_dic[cur_cate].append(model_full_path)
    for model in os.listdir(model_save_path):
        if algorithm in model:
            cate = model.split('_')[0]
            model_merge_dic[cate] = load_model(model_save_path + model)

    model_name_map = eval(
        read_file(result_save_path + algorithm + '_model_name_map.txt'))

    all_right = 0
    all_len = 0
    result_path = result_save_path + 'final_test' + '/'
    if not os.path.exists(result_path):
        os.makedirs(result_path)
    test_result_path = result_path + algorithm + '_test.txt'

    test_datas = {}
    error_cate = {}
    classify_cate = {}
    # build the test texts and labels
    for i in range(class_number):
        cur_cate = cate_list[i]
        contents, labels = get_dataset(test_final_data[cur_cate], cur_cate)
        test_datas[cur_cate] = [contents, labels]
    # test phase
    for cate, cons in test_datas.items():
        test_one_time = time.time()
        cur_cate = cate
        print('cur cate %s' % cur_cate)
        texts, labels = cons[0], cons[1]
        right = 0
        doc_dic = {}
        error_cate[cate] = []
        classify_cate[cate] = []

        test_size = len(labels)
        all_len += test_size

        doc_dic = {i: [] for i in range(test_size)}
        for bin_cate, models in list(model_dic.items()):
            # if not (bin_cate == 'A81' or bin_cate == 'B08' or bin_cate == 'D80'):
            #     continue
            text_pro = []
            # print('load binary model %s'%bin_cate)
            for model in models:
                clf = load_model(model)
                voc = model_name_map[model]
                vectorizer = TfidfVectorizer(vocabulary=voc)
                tdm = vectorizer.fit_transform(texts)
                pred = clf.predict_proba(tdm)
                for i in range(len(pred)):
                    text_pro.append(pred[i][1])
            pro_matrix = np.array(text_pro).reshape((NUMBER, test_size)).T

            temp_pro = []
            for c, md in model_merge_dic.items():
                if bin_cate == c:
                    pre = md.predict_proba(pro_matrix)
                    for j in range(len(pre)):
                        doc_dic[j].append([pre[j][1], c])

        # print(doc_dic)
        for doc, pro_list in doc_dic.items():
            pro_sort = sorted(pro_list, key=lambda d: d[0], reverse=True)
            pre_cate = [pro_sort[0][1], pro_sort[1][1],
                        pro_sort[2][1]]  # take the top-3 predicted classes; accuracy 0.40+
            if cur_cate in pre_cate:  # old method: pro_sort[0][1] == cur_cate
                right += 1
                all_right += 1
            else:
                error_cate[cate].append(pre_cate)
            classify_cate[cate].append(pre_cate)
            doc_dic[doc] = pro_sort[0][1]

        # print(doc_dic[2])
        acc = right / test_size
        print('acc %f' % acc)
        test_one_end_time = time.time()
        one_run_time = round(test_one_end_time - test_one_time, 4)
        print('test one cate time:%f\n' % one_run_time)

        save_file_lines(test_result_path,
                        cur_cate + ' dataset accuracy :%f' % acc + '\n', 'a')
        save_file_lines(
            result_save_path + 'final_test/' + algorithm + '_all.txt',
            '\n' + cate + '\n' + str(classify_cate[cate]), 'a')
        save_file_lines(
            result_save_path + 'final_test/' + algorithm + '_error.txt',
            '\n' + cate + '\n' + str(error_cate[cate]), 'a')
        break  # stop after the first category (debugging shortcut)