Example #1
    string = re.sub(r"n\'t", " n\'t", string)
    string = re.sub(r"\'re", " \'re", string)
    string = re.sub(r"\'d", " \'d", string)
    string = re.sub(r"\'ll", " \'ll", string)
    string = re.sub(r",", " , ", string)
    string = re.sub(r"!", " ! ", string)
    string = re.sub(r"\(", " \( ", string)
    string = re.sub(r"\)", " \) ", string)
    string = re.sub(r"\?", " ? ", string)
    string = re.sub(r"\s{2,}", " ", string)

    return string


# load data
data = DATA(args, tokenizer)
train_iter = data.train_iter
test_iter = data.test_iter

# vocab
wordvocab = data.TEXT.vocab.itos

# full vocab
word_dic_full = {}
word_invdic_full = {}
for ii, ww in enumerate(wordvocab):
    word_dic_full[ww] = ii
    word_invdic_full[ii] = ww

args.embed_num = len(data.TEXT.vocab)
args.class_num = len(data.LABEL.vocab)
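
A minimal round-trip sketch of the token/index dictionaries built above; the toy wordvocab list here stands in for data.TEXT.vocab.itos and is purely illustrative:

# Toy stand-in for data.TEXT.vocab.itos (illustrative values only).
wordvocab = ["<unk>", "<pad>", "the", "movie", "was", "great"]

word_dic_full = {}     # token -> index
word_invdic_full = {}  # index -> token
for ii, ww in enumerate(wordvocab):
    word_dic_full[ww] = ii
    word_invdic_full[ii] = ww

# Encode a tokenised sentence to indices and decode it back.
tokens = "the movie was great".split()
ids = [word_dic_full.get(t, word_dic_full["<unk>"]) for t in tokens]
print(ids)                                 # [2, 3, 4, 5]
print([word_invdic_full[i] for i in ids])  # ['the', 'movie', 'was', 'great']
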
Example #2
import pickle
from load_data import DATA
from sklearn.decomposition import PCA
from sklearn.decomposition import FastICA
from sklearn.random_projection import GaussianRandomProjection
from sklearn.manifold import TSNE
from scipy.stats import kurtosis


DATA_RED = {key: {} for key in DATA}

for key, val in DATA.items():
    print(f"Transforming {key} data...")
    X, y = val
    # pca
    pca = PCA(n_components=0.95, whiten=True, random_state=0)
    X_pca = pca.fit_transform(X - X.mean())
    DATA_RED[key]["pca"] = X_pca, y
    # ica
    ica = FastICA(n_components=X_pca.shape[1], whiten=True, random_state=0)
    X_ica = ica.fit_transform(X)
    DATA_RED[key]["ica"] = X_ica, y
    # rca
    rca = GaussianRandomProjection(n_components=X_pca.shape[1], random_state=0)
    X_rca = rca.fit_transform(X)
    kurt = kurtosis(X_ica)
    kurt_rank = (-kurt).argsort()

    DATA_RED[key]["rca"] = X_rca[:, kurt_rank], y
    # tsne
    tsne = TSNE(n_components=3, random_state=0, n_jobs=-1)
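
A self-contained sketch of the kurtosis-based column ranking computed a few lines above, run on toy data; the array sizes and the synthetic sources are illustrative assumptions, not values from the pickled DATA dict:

import numpy as np
from sklearn.decomposition import FastICA
from scipy.stats import kurtosis

rng = np.random.default_rng(0)
S = rng.laplace(size=(500, 5))            # non-Gaussian toy source signals
A = rng.normal(size=(5, 10))              # toy mixing matrix
X_toy = S @ A                             # observed mixed features, 500 x 10

ica = FastICA(n_components=5, random_state=0)
X_ica_toy = ica.fit_transform(X_toy)      # shape (500, 5), one column per component

kurt = kurtosis(X_ica_toy)                # excess kurtosis of each component column
kurt_rank = (-kurt).argsort()             # column order, most non-Gaussian first
X_ranked = X_ica_toy[:, kurt_rank]        # columns reordered by that ranking
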
Example #3
        params.data_name = dataset

    if dataset in {"synthetic"}:
        params.n_question = 50
        params.batch_size = 128
        params.seqlen = 200
        params.data_dir = '../dataset/' + dataset
        params.data_name = 'naive_c5_q50_s4000_v0'

    params.save = params.data_name
    params.load = params.data_name

    # Setup
    if "pid" not in params.data_name:
        dat = DATA(n_question=params.n_question,
                   seqlen=params.seqlen,
                   separate_char=',')
    else:
        dat = PID_DATA(n_question=params.n_question,
                       seqlen=params.seqlen,
                       separate_char=',')
    seedNum = params.seed
    np.random.seed(seedNum)
    torch.backends.cudnn.deterministic = True
    torch.backends.cudnn.benchmark = False
    torch.manual_seed(seedNum)
    np.random.seed(seedNum)
    file_name_identifier = get_file_name_identifier(params)

    ### Train / Test
    d = vars(params)
Example #4
    params.lr = params.init_lr
    params.memory_key_state_dim = params.q_embed_dim
    params.memory_value_state_dim = params.qa_embed_dim

    params.dataset = dataset
    if not params.gpus:
        ctx = mx.cpu()
        print("Training with cpu ...")
    else:
        ctx = mx.gpu(int(params.gpus))
        print("Training with gpu(" + params.gpus + ") ...")
    params.ctx = ctx

    # Read data
    dat = DATA(n_question=params.n_question,
               seqlen=params.seqlen,
               separate_char=',')

    seedNum = 224
    np.random.seed(seedNum)
    if not params.test:
        params.memory_key_state_dim = params.q_embed_dim
        params.memory_value_state_dim = params.qa_embed_dim
        d = vars(params)
        for key in d:
            print('\t', key, '\t', d[key])
        file_name = 'b' + str(params.batch_size) + \
                    '_q' + str(params.q_embed_dim) + '_qa' + str(params.qa_embed_dim) + \
                    '_m' + str(params.memory_size) + '_std' + str(params.init_std) + \
                    '_lr' + str(params.init_lr) + '_gn' + str(params.maxgradnorm) + \
                    '_f' + str(params.final_fc_dim) + '_s' + str(seedNum)
Example #5
def get_auc(fold_num):
    # Parse Arguments
    parser = argparse.ArgumentParser(description='Script to test KT')
    # Basic Parameters
    parser.add_argument('--max_iter',
                        type=int,
                        default=500,
                        help='number of iterations')
    parser.add_argument('--train_set', type=int, default=fold_num)
    parser.add_argument('--seed', type=int, default=224, help='default seed')

    # Common parameters
    parser.add_argument('--optim',
                        type=str,
                        default='adam',
                        help='Default Optimizer')
    parser.add_argument('--batch_size',
                        type=int,
                        default=24,
                        help='the batch size')
    parser.add_argument('--lr', type=float, default=1e-5, help='learning rate')
    parser.add_argument('--maxgradnorm',
                        type=float,
                        default=-1,
                        help='maximum gradient norm')
    parser.add_argument('--final_fc_dim',
                        type=int,
                        default=512,
                        help='hidden state dim for final fc layer')

    # AKT Specific Parameter
    parser.add_argument('--d_model',
                        type=int,
                        default=256,
                        help='Transformer d_model shape')
    parser.add_argument('--d_ff',
                        type=int,
                        default=1024,
                        help='Transformer d_ff shape')
    parser.add_argument('--dropout',
                        type=float,
                        default=0.05,
                        help='Dropout rate')
    parser.add_argument('--n_block',
                        type=int,
                        default=1,
                        help='number of blocks')
    parser.add_argument('--n_head',
                        type=int,
                        default=8,
                        help='number of heads in multihead attention')
    parser.add_argument('--kq_same', type=int, default=1)

    # AKT-R Specific Parameter
    parser.add_argument('--l2',
                        type=float,
                        default=1e-5,
                        help='l2 penalty for difficulty')

    # DKVMN Specific  Parameter
    parser.add_argument('--s_embed_dim',
                        type=int,
                        default=50,
                        help='question embedding dimensions')
    parser.add_argument('--sa_embed_dim',
                        type=int,
                        default=256,
                        help='skill-response embedding dimensions')
    parser.add_argument('--memory_size',
                        type=int,
                        default=50,
                        help='memory size')
    parser.add_argument('--init_std',
                        type=float,
                        default=0.1,
                        help='weight initialization std')
    # DKT Specific Parameter
    parser.add_argument('--hidden_dim', type=int, default=512)
    parser.add_argument('--lamda_r', type=float, default=0.1)
    parser.add_argument('--lamda_w1', type=float, default=0.1)
    parser.add_argument('--lamda_w2', type=float, default=0.1)

    # Datasets and Model
    parser.add_argument(
        '--model',
        type=str,
        default='akt_eid',
        help="combination of akt, eid (mandatory) separated by underscore '_'."
    )
    parser.add_argument('--dataset', type=str, default='assist2009_eid')

    params = parser.parse_args()
    dataset = params.dataset

    if dataset in {'assist2009_eid'}:
        params.batch_size = 24
        params.seqlen = 400
        params.data_dir = 'data/' + dataset
        params.data_name = dataset
        params.n_skill = 124
        params.n_eid = 26688
        params.n_tid = 214  #maximum true response count in past
        params.n_fid = 214  #maximum false response count in past
        params.n_xid = 0
        params.n_yid = 0

    if dataset in {'assist2017_eid'}:
        params.batch_size = 24
        params.seqlen = 200
        params.data_dir = 'data/' + dataset
        params.data_name = dataset
        params.n_skill = 102
        params.n_eid = 3162
        params.n_tid = 12  #maximum true response count in past
        params.n_fid = 90  #maximum false response count in past
        params.n_xid = 0
        params.n_yid = 0

    params.save = params.data_name
    params.load = params.data_name

    # Setup
    if 'eid' in params.data_name:
        dat = EID_DATA(n_skill=params.n_skill,
                       seqlen=params.seqlen,
                       separate_char=',')
    else:
        dat = DATA(n_skill=params.n_skill,
                   seqlen=params.seqlen,
                   separate_char=',')

    seedNum = params.seed
    np.random.seed(seedNum)
    torch.backends.cudnn.deterministic = True
    torch.backends.cudnn.benchmark = False
    torch.manual_seed(seedNum)
    np.random.seed(seedNum)
    file_name_identifier = get_file_name_identifier(params)

    ### Train / Test
    d = vars(params)
    for key in d:
        print('\t', key, '\t', d[key])

    #model path
    file_name = ''
    for item_ in file_name_identifier:
        file_name = file_name + item_[0] + str(item_[1])

    train_data_path = params.data_dir + '/' + \
        params.data_name + '_train' + str(params.train_set) + '.csv'
    valid_data_path = params.data_dir + "/" + \
        params.data_name + '_valid' + str(params.train_set) + '.csv'

    train_s_data, train_sa_data, train_eid, train_tid, train_fid, train_xid, train_yid = dat.load_data(
        train_data_path)
    valid_s_data, valid_sa_data, valid_eid, valid_tid, valid_fid, valid_xid, valid_yid = dat.load_data(
        valid_data_path)

    print('\n')
    print('train_s_data.shape', train_s_data.shape)
    print('train_sa_data.shape', train_sa_data.shape)
    print('train_eid.shape', train_eid.shape)
    print('train_tid.shape', train_tid.shape)
    print('train_fid.shape', train_fid.shape)
    print('valid_s_data.shape', valid_s_data.shape)
    print('valid_sa_data.shape', valid_sa_data.shape)
    print('valid_eid.shape', valid_eid.shape)
    print('valid_tid.shape', valid_tid.shape)
    print('valid_fid.shape', valid_fid.shape)
    print('\n')
    # Train and get the best episode
    best_epoch = train_one_dataset(params, file_name, train_s_data,
                                   train_sa_data, train_eid, train_tid,
                                   train_fid, train_xid, train_yid,
                                   valid_s_data, valid_sa_data, valid_eid,
                                   valid_tid, valid_fid, valid_xid, valid_yid)
    test_data_path = params.data_dir + '/' + \
        params.data_name + '_test' + str(params.train_set) + '.csv'
    test_s_data, test_sa_data, test_eid, test_tid, test_fid, test_xid, test_yid, test_s_num = dat.load_test_data(
        test_data_path)
    auc = test_one_dataset(params, file_name, test_s_data, test_sa_data,
                           test_eid, test_tid, test_fid, test_xid, test_yid,
                           best_epoch)
    return test_s_num, auc
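
A hedged driver sketch for get_auc above: the five-fold loop and the size-weighted averaging are assumptions for illustration, not part of the snippet.

# Hypothetical cross-validation driver (fold count and weighting are assumptions).
fold_sizes, fold_aucs = [], []
for fold in range(1, 6):
    n_test, auc = get_auc(fold)           # test_s_num and AUC for this fold
    fold_sizes.append(n_test)
    fold_aucs.append(auc)

weighted_auc = sum(n * a for n, a in zip(fold_sizes, fold_aucs)) / sum(fold_sizes)
print("size-weighted test AUC over folds:", weighted_auc)
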
Example #6
def get_auc(fold_num):
    # Parse Arguments
    parser = argparse.ArgumentParser(description='Script to test KT')
    # Basic Parameters
    parser.add_argument('--max_iter', type=int, default=1000,
                        help='number of iterations')
    parser.add_argument('--train_set', type=int, default=fold_num)
    parser.add_argument('--seed', type=int, default=224, help='default seed')

    # Common parameters
    parser.add_argument('--optim', type=str, default='adam',
                        help='Default Optimizer')
    parser.add_argument('--batch_size', type=int,
                        default=24, help='the batch size')
    parser.add_argument('--lr', type=float, default=1e-5,
                        help='learning rate')
    parser.add_argument('--maxgradnorm', type=float,
                        default=-1, help='maximum gradient norm')
    parser.add_argument('--final_fc_dim', type=int, default=512,
                        help='hidden state dim for final fc layer')

    # AKT Specific Parameter
    parser.add_argument('--d_model', type=int, default=256,
                        help='Transformer d_model shape')
    parser.add_argument('--d_ff', type=int, default=1024,
                        help='Transformer d_ff shape')
    parser.add_argument('--dropout', type=float,
                        default=0.05, help='Dropout rate')
    parser.add_argument('--n_block', type=int, default=1,
                        help='number of blocks')
    parser.add_argument('--n_head', type=int, default=8,
                        help='number of heads in multihead attention')
    parser.add_argument('--kq_same', type=int, default=1)

    # AKT-R Specific Parameter
    parser.add_argument('--l2', type=float, default=1e-5, help='l2 penalty for difficulty')

    # Datasets and Model
    parser.add_argument('--model', type=str, default='akt',
                        help="combination of akt(mandatory), e_p_f_a (mandatory) separated by underscore '_'.")
    parser.add_argument('--dataset', type=str, default="assist2009")
    parser.add_argument('--test', type=bool, default=False, help='enable testing')

    params = parser.parse_args()
    dataset = params.dataset

    if dataset in {"assist2009"}:
        params.n_question = 124
        params.batch_size = 24
        params.seqlen = 400
        params.data_dir = 'data/'+dataset
        params.data_name = dataset
        params.n_pid = 19932
        params.n_tid = 8
        params.n_fid = 8
        params.n_sd = 14 #sequence_delay
        params.n_rd = 11 #repeat_delay
        params.n_xid = 816
        params.n_yid = 4

    if dataset in {"assist2017"}:
        params.batch_size = 24
        params.seqlen = 200
        params.data_dir = 'data/'+dataset
        params.data_name = dataset
        params.n_question = 102
        params.n_pid = 0  # 3162
        params.n_tid = 4
        params.n_fid = 7
        params.n_sd = 18 #sequence_delay
        params.n_rd = 20 #repeat_delay
        params.n_xid = 16
        params.n_yid = 6

    if dataset in {"statics"}:
        params.n_question = 1223
        params.batch_size = 24
        params.seqlen = 200
        params.data_dir = 'data/'+dataset
        params.data_name = dataset
        params.n_pid = 0
        params.n_tid = 8
        params.n_fid = 9
        params.n_sd = 16
        params.n_rd = 17
        params.n_xid = 382
        params.n_yid = 19

    if dataset in {"slepemapy"}:
        params.n_question = 1277
        params.batch_size = 24
        params.seqlen = 200
        params.data_dir = 'data/'+dataset
        params.data_name = dataset
        params.n_pid = 56030
        params.n_tid = 7
        params.n_fid = 5
        params.n_sd = 14
        params.n_rd = 15
        params.n_xid = 21
        params.n_yid = 0  # 56030

    params.save = params.data_name
    params.load = params.data_name

    # Setup
    dat = DATA(n_question=params.n_question, seqlen=params.seqlen, separate_char=',')

    seedNum = params.seed
    np.random.seed(seedNum)
    torch.backends.cudnn.deterministic = True
    torch.backends.cudnn.benchmark = False
    torch.manual_seed(seedNum)
    np.random.seed(seedNum)
    file_name_identifier = get_file_name_identifier(params)

    ### Train / Test
    d = vars(params)
    for key in d:
        print('\t', key, '\t', d[key])

    #model path
    file_name = ''
    for item_ in file_name_identifier:
        file_name = file_name+item_[0] + str(item_[1])

    train_data_path = params.data_dir + "/" + \
                          params.data_name + "_train"+str(params.train_set)+".csv"
    valid_data_path = params.data_dir + "/" + \
                          params.data_name + "_valid"+str(params.train_set)+".csv"

    train_data = dat.load_data(train_data_path)
    valid_data = dat.load_data(valid_data_path)
    # Train and get the best episode
    best_epoch = train_one_dataset(
            params, file_name, train_data, valid_data)
    test_data_path = params.data_dir + "/" + \
                         params.data_name + "_test"+str(params.train_set)+".csv"
    test_data = dat.load_test_data(test_data_path)
    auc, acc, loss = test_one_dataset(params, file_name, test_data, best_epoch)
    return test_data[-1], auc, acc, loss
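
For reference, the fold-specific CSV paths concatenated above can equivalently be built with os.path.join; this is a sketch assuming the same directory layout as the snippet.

import os

fold = params.train_set
train_data_path = os.path.join(params.data_dir, f"{params.data_name}_train{fold}.csv")
valid_data_path = os.path.join(params.data_dir, f"{params.data_name}_valid{fold}.csv")
test_data_path = os.path.join(params.data_dir, f"{params.data_name}_test{fold}.csv")
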
Example #7
def adjust_param(max_iter, batch_size, seqlen, test_seqlen, min_seqlen, max_seqlen):
    parser = argparse.ArgumentParser(description='Script to test KVMN.')
    parser.add_argument('--gpus', type=str, default='0', help='the gpus will be used, e.g "0,1,2,3"')
    parser.add_argument('--max_iter', type=int, default=max_iter, help='number of iterations')  # default=50
    parser.add_argument('--test', type=bool, default=False, help='enable testing')
    parser.add_argument('--train_test', type=bool, default=True, help='enable testing')
    parser.add_argument('--show', type=bool, default=True, help='print progress')

    dataset = "STATICS"  # assist2009_updated / assist2015 / KDDal0506 / STATICS

    if dataset == "assist2009_updated":
        parser.add_argument('--batch_size', type=int, default=batch_size, help='the batch size')  # 32
        parser.add_argument('--q_embed_dim', type=int, default=50, help='question embedding dimensions')  # 50
        parser.add_argument('--qa_embed_dim', type=int, default=10,
                            help='answer and question embedding dimensions')  # 200
        parser.add_argument('--memory_size', type=int, default=10, help='memory size')

        parser.add_argument('--init_std', type=float, default=0.1, help='weight initialization std')
        parser.add_argument('--init_lr', type=float, default=0.05, help='initial learning rate')
        parser.add_argument('--final_lr', type=float, default=1E-5,
                            help='learning rate will not decrease after hitting this threshold')
        parser.add_argument('--momentum', type=float, default=0.9, help='momentum rate')
        parser.add_argument('--maxgradnorm', type=float, default=50.0, help='maximum gradient norm')
        parser.add_argument('--final_fc_dim', type=float, default=50, help='hidden state dim for final fc layer')

        parser.add_argument('--n_question', type=int, default=110, help='the number of unique questions in the dataset')
        parser.add_argument('--seqlen', type=int, default=seqlen,
                            help='the allowed maximum length of a sequence')  # 200
        parser.add_argument('--data_dir', type=str, default='../../data/assist2009_updated', help='data directory')
        parser.add_argument('--data_name', type=str, default='assist2009_updated', help='data set name')
        parser.add_argument('--load', type=str, default='assist2009_updated', help='model file to load')
        parser.add_argument('--save', type=str, default='assist2009_updated', help='path to save model')
    elif dataset == "assist2015":
        parser.add_argument('--batch_size', type=int, default=batch_size, help='the batch size')
        parser.add_argument('--q_embed_dim', type=int, default=50, help='question embedding dimensions')
        parser.add_argument('--qa_embed_dim', type=int, default=10, help='answer and question embedding dimensions')
        parser.add_argument('--memory_size', type=int, default=10, help='memory size')

        parser.add_argument('--init_std', type=float, default=0.1, help='weight initialization std')
        parser.add_argument('--init_lr', type=float, default=0.1, help='initial learning rate')
        parser.add_argument('--final_lr', type=float, default=1E-5,
                            help='learning rate will not decrease after hitting this threshold')
        parser.add_argument('--momentum', type=float, default=0.9, help='momentum rate')
        parser.add_argument('--maxgradnorm', type=float, default=50.0, help='maximum gradient norm')
        parser.add_argument('--final_fc_dim', type=float, default=50, help='hidden state dim for final fc layer')

        parser.add_argument('--n_question', type=int, default=100, help='the number of unique questions in the dataset')
        parser.add_argument('--seqlen', type=int, default=seqlen, help='the allowed maximum length of a sequence')
        parser.add_argument('--data_dir', type=str, default='../../data/assist2015', help='data directory')
        parser.add_argument('--data_name', type=str, default='assist2015', help='data set name')
        parser.add_argument('--load', type=str, default='assist2015', help='model file to load')
        parser.add_argument('--save', type=str, default='assist2015', help='path to save model')
    elif dataset == "STATICS":
        parser.add_argument('--batch_size', type=int, default=batch_size, help='the batch size')
        parser.add_argument('--q_embed_dim', type=int, default=50, help='question embedding dimensions')
        parser.add_argument('--qa_embed_dim', type=int, default=100, help='answer and question embedding dimensions')
        parser.add_argument('--memory_size', type=int, default=10, help='memory size')

        parser.add_argument('--init_std', type=float, default=0.1, help='weight initialization std')
        parser.add_argument('--init_lr', type=float, default=0.01, help='initial learning rate')
        parser.add_argument('--final_lr', type=float, default=1E-5,
                            help='learning rate will not decrease after hitting this threshold')
        parser.add_argument('--momentum', type=float, default=0.9, help='momentum rate')
        parser.add_argument('--maxgradnorm', type=float, default=50.0, help='maximum gradient norm')
        parser.add_argument('--final_fc_dim', type=float, default=50, help='hidden state dim for final fc layer')

        parser.add_argument('--n_question', type=int, default=1223,
                            help='the number of unique questions in the dataset')
        parser.add_argument('--seqlen', type=int, default=seqlen, help='the allowed maximum length of a sequence')
        parser.add_argument('--data_dir', type=str, default='../../data/STATICS', help='data directory')
        parser.add_argument('--data_name', type=str, default='STATICS', help='data set name')
        parser.add_argument('--load', type=str, default='STATICS', help='model file to load')
        parser.add_argument('--save', type=str, default='STATICS', help='path to save model')

    params = parser.parse_args()
    params.lr = params.init_lr
    params.memory_key_state_dim = params.q_embed_dim
    params.memory_value_state_dim = params.qa_embed_dim

    params.ctx = mx.cpu()

    # test_seqlen = params.seqlen
    #
    # Read data
    train_dat = DATA(n_question=params.n_question, seqlen=params.seqlen, separate_char=',')
    test_dat = DATA(n_question=params.n_question, seqlen=test_seqlen, separate_char=',')
    seedNum = 224
    np.random.seed(seedNum)
    if not params.test:
        params.memory_key_state_dim = params.q_embed_dim
        params.memory_value_state_dim = params.qa_embed_dim
        train_seqlen = params.seqlen
        d = vars(params)

        train_data_path = params.data_dir + "/" + params.data_name + "_sub_train_"+str(min_seqlen)+"_"+str(max_seqlen)+".csv"
        valid_data_path = params.data_dir + "/" + params.data_name + "_sub_valid_"+str(min_seqlen)+"_"+str(max_seqlen)+".csv"
        test_data_path = params.data_dir + "/" + params.data_name + "_test_"+str(min_seqlen)+"_"+str(max_seqlen)+".csv"

        train_u2q_data, train_u2qa_data = train_dat.load_data(train_data_path)
        valid_u2q_data, valid_u2qa_data, valid_u2tf_data = train_dat.load_test_data(valid_data_path, 0.111)  # 0.1/0.9
        test_u2q_data, test_u2qa_data, test_u2tf_data = test_dat.load_test_data(test_data_path, 0.1)

        total_train_valid_acc = 0
        total_train_valid_loss = 0
        total_test_valid_auc = 0
        total_test_valid_acc = 0
        total_test_valid_loss = 0
        user_count = 0
        best_epoch = 30

        all_pred_list = []
        all_target_list = []
        i = 0
        for user_id in train_u2q_data:
            params.seqlen = train_seqlen
            file_name = 'u' + user_id + '_b' + str(params.batch_size) + \
                        '_q' + str(params.q_embed_dim) + '_qa' + str(params.qa_embed_dim) + \
                        '_m' + str(params.memory_size) + '_std' + str(params.init_std) + \
                        '_lr' + str(params.init_lr) + '_gn' + str(params.maxgradnorm) + \
                        '_f' + str(params.final_fc_dim) + '_s' + str(seedNum)
            train_q_data = train_u2q_data[user_id]
            train_qa_data = train_u2qa_data[user_id]
            valid_q_data = valid_u2q_data[user_id]
            valid_qa_data = valid_u2qa_data[user_id]
            valid_tf_data = valid_u2tf_data[user_id]

            train_valid_acc, train_valid_loss = train_one_dataset(params, file_name, train_q_data, train_qa_data,
                                                                  valid_q_data, valid_qa_data, valid_tf_data)

            total_train_valid_acc += train_valid_acc
            total_train_valid_loss += train_valid_loss

            if params.train_test:
                params.seqlen = test_seqlen
                test_q_data = test_u2q_data[user_id]
                test_qa_data = test_u2qa_data[user_id]
                test_tf_data = test_u2tf_data[user_id]

                pred_list, target_list = test_one_dataset(params, file_name, test_q_data, test_qa_data, test_tf_data,
                                                          best_epoch, user_id)
                all_pred_list += pred_list
                all_target_list += target_list
            user_count += 1

        average_train_valid_acc = total_train_valid_acc / user_count
        average_train_valid_loss = total_train_valid_loss / user_count

        # print("average_train_valid_acc: ", average_train_valid_acc)
        # print("average_train_valid_loss: ", average_train_valid_loss)

        all_pred = np.concatenate(all_pred_list, axis=0)
        all_target = np.concatenate(all_target_list, axis=0)
        loss = run.binaryEntropy(all_target, all_pred)
        auc = run.compute_auc(all_target, all_pred)
        acc = run.compute_accuracy(all_target, all_pred)

        # print("valid_auc: ", auc)
        # print("valid_acc: ", acc)
        # print("valid_loss: ", loss)
        return auc
    else:
        params.memory_key_state_dim = params.q_embed_dim
        params.memory_value_state_dim = params.qa_embed_dim
        params.seqlen = test_seqlen
        d = vars(params)

        train_data_path = params.data_dir + "/" + params.data_name + "_sub_train_"+str(min_seqlen)+"_"+str(max_seqlen)+".csv"
        valid_data_path = params.data_dir + "/" + params.data_name + "_sub_valid_"+str(min_seqlen)+"_"+str(max_seqlen)+".csv"
        test_data_path = params.data_dir + "/" + params.data_name + "_test_"+str(min_seqlen)+"_"+str(max_seqlen)+".csv"

        train_u2q_data, train_u2qa_data = train_dat.load_data(train_data_path)
        test_u2q_data, test_u2qa_data, test_u2tf_data = test_dat.load_test_data(test_data_path, 0.1)

        user_count = 0
        best_epoch = 30

        all_pred_list = []
        all_target_list = []
        i = 0
        for user_id in train_u2q_data:
            file_name = params.save + '-dkvmn_initialization'

            test_q_data = test_u2q_data[user_id]
            test_qa_data = test_u2qa_data[user_id]
            test_tf_data = test_u2tf_data[user_id]

            pred_list, target_list = test_one_dataset(params, file_name, test_q_data, test_qa_data, test_tf_data,
                                                      best_epoch, user_id)
            all_pred_list += pred_list
            all_target_list += target_list

            user_count += 1

        all_pred = np.concatenate(all_pred_list, axis=0)
        all_target = np.concatenate(all_target_list, axis=0)
        loss = run.binaryEntropy(all_target, all_pred)
        auc = run.compute_auc(all_target, all_pred)
        acc = run.compute_accuracy(all_target, all_pred)

        # print("valid_auc: ", auc)
        # print("valid_acc: ", acc)
        # print("valid_loss: ", loss)

        return auc
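
A hedged sketch of sweeping adjust_param above over a small grid of batch sizes; every concrete value below is an illustrative assumption rather than a setting from the snippet.

# Hypothetical sweep: all values here are assumptions for illustration.
results = {}
for bs in (16, 32, 64):
    auc = adjust_param(max_iter=50, batch_size=bs, seqlen=200,
                       test_seqlen=200, min_seqlen=10, max_seqlen=200)
    results[bs] = auc

best_bs = max(results, key=results.get)
print("best batch size:", best_bs, "with AUC:", results[best_bs])
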
Example #8
    params = parser.parse_args()
    params.lr = params.init_lr
    params.memory_key_state_dim = params.q_embed_dim
    params.memory_value_state_dim = params.qa_embed_dim

    params.dataset = dataset
    if params.gpus is None:
        ctx = mx.cpu()
        print("Training with cpu ...")
    else:
        ctx = mx.gpu(int(params.gpus))
        print("Training with gpu(" + params.gpus + ") ...")
    params.ctx = ctx

    # Read data
    dat = DATA(n_question=params.n_question, seqlen=params.seqlen, separate_char=',')

    seedNum = 224
    np.random.seed(seedNum)
    if not params.test:
        params.memory_key_state_dim = params.q_embed_dim
        params.memory_value_state_dim = params.qa_embed_dim
        d = vars(params)
        for key in d:
            print('\t', key, '\t', d[key])
        file_name = 'b' + str(params.batch_size) + \
                    '_q' + str(params.q_embed_dim) + '_qa' + str(params.qa_embed_dim) + \
                    '_m' + str(params.memory_size) + '_std' + str(params.init_std) + \
                    '_lr' + str(params.init_lr) + '_gn' + str(params.maxgradnorm) + \
                    '_f' + str(params.final_fc_dim)+'_s'+str(seedNum)
        train_data_path = params.data_dir + "/" + params.data_name + "_train1.csv"