Example #1
0
def load_tt_datas(config=None, reload=True):
    """
    Load the train/test datasets and the embedding dictionaries.

    config: dict with 'dataset' ('lap' | 'res' | 'dong'), 'class_num' and
        'emb_stddev'.  The loaded embeddings are written back into it as
        'pre_embedding', 'sswe_embedding' and 'rand_embedding'.
    reload: if True, re-parse the raw data files and dump the intermediate
        results; if False, load the previously dumped intermediate files.
    Returns (train_data, test_data).

    NOTE(review): an unrecognised config['dataset'] leaves train_data /
    test_data unbound and the final return raises UnboundLocalError —
    original behaviour, kept as-is.
    """
    # A `config={}` default dict would be shared across calls (mutable
    # default-argument pitfall); create a fresh dict per call instead.
    if config is None:
        config = {}

    if reload:
        print("reload the datasets.")
        if config['dataset'] == 'lap':
            train_data, test_data, word2idx = load_data(
                lap_train_path, lap_test_path, class_num=config['class_num'])
        elif config['dataset'] == 'res':
            train_data, test_data, word2idx = load_data(
                res_train_path, res_test_path, class_num=config['class_num'])
        elif config['dataset'] == 'dong':
            train_data, test_data, word2idx = load_data_dong(
                dong_train_path, dong_test_path, class_num=config['class_num'])

        # Build the three embedding tables for the parsed vocabulary.
        emb_dict = load_glove(pre_train,
                              word2idx,
                              init_std=config['emb_stddev'])
        sswe_dict = load_ssweu(sswe_path,
                               word2idx,
                               init_std=config['emb_stddev'])
        rand_dict = load_random('', word2idx, init_std=config['emb_stddev'])

        # Dump the pre-processed data so later runs can use reload=False.
        if config['class_num'] == 2:
            path = 'datas/mid_data_2classes/'
        else:
            path = 'datas/mid_data_3classes/'
        if config['dataset'] == 'lap':
            dump_file([train_data, path + mid_lap_train_data],
                      [test_data, path + mid_lap_test_data],
                      [emb_dict, path + mid_lap_emb_dict],
                      [word2idx, path + mid_lap_word2idx],
                      [sswe_dict, path + mid_lap_sswe_dict],
                      [rand_dict, path + mid_lap_rand_dict])
        elif config['dataset'] == 'res':
            dump_file([train_data, path + mid_res_train_data],
                      [test_data, path + mid_res_test_data],
                      [emb_dict, path + mid_res_emb_dict],
                      [word2idx, path + mid_res_word2idx],
                      [sswe_dict, path + mid_res_sswe_dict],
                      [rand_dict, path + mid_res_rand_dict])
        elif config['dataset'] == 'dong':
            dump_file([train_data, path + mid_dong_train_data],
                      [test_data, path + mid_dong_test_data],
                      [emb_dict, path + mid_dong_emb_dict],
                      [word2idx, path + mid_dong_word2idx],
                      [sswe_dict, path + mid_dong_sswe_dict],
                      [rand_dict, path + mid_dong_rand_dict])
    else:
        print("not reload the datasets.")
        if config['class_num'] == 2:
            path = 'datas/mid_data_2classes/'
        else:
            path = 'datas/mid_data_3classes/'

        # Intermediate files are loaded in a fixed order:
        # train, test, emb, sswe, rand (word2idx is not reloaded here).
        if config['dataset'] == 'lap':
            datas = load_file(path + mid_lap_train_data,
                              path + mid_lap_test_data,
                              path + mid_lap_emb_dict,
                              path + mid_lap_sswe_dict,
                              path + mid_lap_rand_dict)
        elif config['dataset'] == 'res':
            datas = load_file(path + mid_res_train_data,
                              path + mid_res_test_data,
                              path + mid_res_emb_dict,
                              path + mid_res_sswe_dict,
                              path + mid_res_rand_dict)
        elif config['dataset'] == 'dong':
            datas = load_file(path + mid_dong_train_data,
                              path + mid_dong_test_data,
                              path + mid_dong_emb_dict,
                              path + mid_dong_sswe_dict,
                              path + mid_dong_rand_dict)

        train_data = datas[0]
        test_data = datas[1]
        emb_dict = datas[2]
        sswe_dict = datas[3]
        rand_dict = datas[4]

    # Expose the embeddings to the caller through the config dict.
    config['pre_embedding'] = emb_dict
    config['sswe_embedding'] = sswe_dict
    config['rand_embedding'] = rand_dict
    return train_data, test_data
Example #2
0
def load_tt_datas(config=None, reload=True):
    """
    Load the train/test session datasets and the item-embedding table.

    config: dict with 'dataset' ('home' | 'rsc15_4' | 'rsc15_64' | 'cikm16'),
        'hidden_size', 'emb_stddev' and (for 'cikm16') 'class_num'.
        The function writes 'n_items' and 'pre_embedding' back into it.
    reload: if True, build a fresh random embedding table and dump it;
        if False, load the previously dumped embedding file.
    Returns (train_data, test_data).

    NOTE(review): an unrecognised config['dataset'] (and 'home' with
    reload=False) leaves train_data / test_data unbound and the final
    return raises UnboundLocalError — original behaviour, kept as-is.
    """
    # A `config={}` default dict would be shared across calls (mutable
    # default-argument pitfall); create a fresh dict per call instead.
    if config is None:
        config = {}

    if reload:
        print("reload the datasets.")
        print(config['dataset'])

        if config['dataset'] == 'home':
            train_data, test_data, item2idx, n_items = load_data_p(
                home_train, home_test, pro=None)
            # n_items includes a padding slot; the model sees n_items - 1.
            config["n_items"] = n_items - 1
            emb_dict = load_random(item2idx,
                                   edim=config['hidden_size'],
                                   init_std=config['emb_stddev'])
            config['pre_embedding'] = emb_dict
            path = 'datas/mid_data'
            dump_file([emb_dict, path + home_emb_dict])
            print("-----")

        if config['dataset'] == 'rsc15_4':
            # 'pro' selects the fraction of the RSC15 data (1/4 here).
            train_data, test_data, item2idx, n_items = load_data_p(
                rsc15_train, rsc15_test, pro=4)
            config["n_items"] = n_items - 1
            emb_dict = load_random(item2idx,
                                   edim=config['hidden_size'],
                                   init_std=config['emb_stddev'])
            config['pre_embedding'] = emb_dict
            path = 'datas/mid_data'
            dump_file([emb_dict, path + mid_rsc15_4_emb_dict])
            print("-----")

        if config['dataset'] == 'rsc15_64':
            train_data, test_data, item2idx, n_items = load_data_p(
                rsc15_train, rsc15_test, pro=64)
            config["n_items"] = n_items - 1
            emb_dict = load_random(item2idx,
                                   edim=config['hidden_size'],
                                   init_std=config['emb_stddev'])
            config['pre_embedding'] = emb_dict
            path = 'datas/mid_data'
            dump_file([emb_dict, path + mid_rsc15_64_emb_dict])
            print("-----")

        if config['dataset'] == 'cikm16':
            train_data, test_data, item2idx, n_items = load_data2(
                cikm16_train, cikm16_test, class_num=config['class_num'])
            config["n_items"] = n_items - 1
            emb_dict = load_random(item2idx,
                                   edim=config['hidden_size'],
                                   init_std=config['emb_stddev'])
            config['pre_embedding'] = emb_dict
            path = 'datas/mid_data'
            dump_file([emb_dict, path + mid_cikm16_emb_dict])
            print("-----")

    else:
        print("not reload the datasets.")
        print(config['dataset'])

        if config['dataset'] == 'rsc15_4':
            train_data, test_data, item2idx, n_items = load_data_p(
                rsc15_train, rsc15_test, pro=4)
            config["n_items"] = n_items - 1
            path = 'datas/mid_data'
            # load_file returns a list; the embedding table is element 0.
            emb_dict = load_file(path + mid_rsc15_4_emb_dict)
            config['pre_embedding'] = emb_dict[0]
            print("-----")

        if config['dataset'] == 'rsc15_64':
            train_data, test_data, item2idx, n_items = load_data_p(
                rsc15_train, rsc15_test, pro=64)
            config["n_items"] = n_items - 1
            path = 'datas/mid_data'
            emb_dict = load_file(path + mid_rsc15_64_emb_dict)
            config['pre_embedding'] = emb_dict[0]
            print("-----")

        if config['dataset'] == 'cikm16':
            train_data, test_data, item2idx, n_items = load_data2(
                cikm16_train, cikm16_test, class_num=config['class_num'])
            config["n_items"] = n_items - 1
            path = 'datas/mid_data'
            emb_dict = load_file(path + mid_cikm16_emb_dict)
            config['pre_embedding'] = emb_dict[0]
            print("-----")

    return train_data, test_data
Example #3
0
def load_tt_datas(config=None, reload=True):
    """
    Load the train/test session datasets and the item-embedding table.

    config: dict with 'dataset' ('rsc15' | 'rsc15_4' | 'rsc15_64' | 'cikm16'),
        'class_num', 'hidden_size' and 'emb_stddev'.  The function writes
        'n_items' and 'pre_embedding' back into it.
    reload: if True, build a fresh random embedding table and dump it;
        if False, load the previously dumped embedding file.
    Returns (train_data, test_data).

    NOTE(review): an unrecognised config['dataset'] leaves train_data /
    test_data unbound and the final return raises UnboundLocalError —
    original behaviour, kept as-is.
    """
    # A `config={}` default dict would be shared across calls (mutable
    # default-argument pitfall); create a fresh dict per call instead.
    if config is None:
        config = {}

    if reload:
        print("reload the datasets.")
        print(config['dataset'])
        if config['dataset'] == 'rsc15':
            train_data, test_data, item2idx = load_data(
                rsc15_train, rsc15_test, class_num=config['class_num'])
            # Count distinct item ids straight from the raw training file.
            data = pd.read_csv(rsc15_train,
                               sep='\t',
                               dtype={'ItemId': np.int64})
            itemids = data["ItemId"].unique()  # unique ItemId sequence
            n_items = len(itemids)
            # NOTE(review): this branch stores n_items while every other
            # branch stores n_items - 1 — confirm whether intentional.
            config["n_items"] = n_items
            emb_dict = load_random(item2idx,
                                   edim=config['hidden_size'],
                                   init_std=config['emb_stddev'])
            config['pre_embedding'] = emb_dict
            path = 'datas/mid_data'
            dump_file([emb_dict, path + mid_rsc15_emb_dict])
            print("-----")

        if config['dataset'] == 'rsc15_4':
            # 'pro' selects the fraction of the RSC15 data (1/4 here).
            train_data, test_data, item2idx, n_items = load_data_p(rsc15_train,
                                                                   rsc15_test,
                                                                   pro=4)
            config["n_items"] = n_items - 1
            emb_dict = load_random(item2idx,
                                   edim=config['hidden_size'],
                                   init_std=config['emb_stddev'])
            config['pre_embedding'] = emb_dict
            path = 'datas/mid_data'
            dump_file([emb_dict, path + mid_rsc15_4_emb_dict])
            print("-----")

        if config['dataset'] == 'rsc15_64':
            train_data, test_data, item2idx, n_items = load_data_p(rsc15_train,
                                                                   rsc15_test,
                                                                   pro=64)
            config["n_items"] = n_items - 1
            emb_dict = load_random(item2idx,
                                   edim=config['hidden_size'],
                                   init_std=config['emb_stddev'])
            config['pre_embedding'] = emb_dict
            path = 'datas/mid_data'
            dump_file([emb_dict, path + mid_rsc15_64_emb_dict])
            print("-----")

        if config['dataset'] == 'cikm16':
            train_data, test_data, item2idx, n_items = load_data2(
                cikm16_train, cikm16_test, class_num=config['class_num'])
            config["n_items"] = n_items - 1
            emb_dict = load_random(item2idx,
                                   edim=config['hidden_size'],
                                   init_std=config['emb_stddev'])
            config['pre_embedding'] = emb_dict
            path = 'datas/mid_data'
            dump_file([emb_dict, path + mid_cikm16_emb_dict])
            print("-----")

    else:
        print("not reload the datasets.")
        print(config['dataset'])
        if config['dataset'] == 'rsc15':
            train_data, test_data, item2idx = load_data(
                rsc15_train, rsc15_test, class_num=config['class_num'])
            data = pd.read_csv(rsc15_train,
                               sep='\t',
                               dtype={'ItemId': np.int64})
            itemids = data["ItemId"].unique()  # unique ItemId sequence
            n_items = len(itemids)
            config["n_items"] = n_items

            path = 'datas/mid_data'
            # load_file returns a list; the embedding table is element 0.
            emb_dict = load_file(path + mid_rsc15_emb_dict)
            config['pre_embedding'] = emb_dict[0]

            print("-----")

        if config['dataset'] == 'rsc15_4':
            train_data, test_data, item2idx, n_items = load_data_p(rsc15_train,
                                                                   rsc15_test,
                                                                   pro=4)
            config["n_items"] = n_items - 1
            path = 'datas/mid_data'
            emb_dict = load_file(path + mid_rsc15_4_emb_dict)
            config['pre_embedding'] = emb_dict[0]
            print("-----")

        if config['dataset'] == 'rsc15_64':
            train_data, test_data, item2idx, n_items = load_data_p(rsc15_train,
                                                                   rsc15_test,
                                                                   pro=64)
            config["n_items"] = n_items - 1
            path = 'datas/mid_data'
            emb_dict = load_file(path + mid_rsc15_64_emb_dict)
            config['pre_embedding'] = emb_dict[0]
            print("-----")

        if config['dataset'] == 'cikm16':
            train_data, test_data, item2idx, n_items = load_data2(
                cikm16_train, cikm16_test, class_num=config['class_num'])
            config["n_items"] = n_items - 1
            path = 'datas/mid_data'
            emb_dict = load_file(path + mid_cikm16_emb_dict)
            config['pre_embedding'] = emb_dict[0]
            print("-----")

    return train_data, test_data