Ejemplos de get_dataset en Python

Lenguaje de programación: Python

Namespace/Package Name: dataprep

Método / Función: get_dataset

Ejemplos en hotexamples.com: 5

Python get_dataset - 5 ejemplos encontrados. Estos son los ejemplos en Python del mundo real mejor valorados de dataprep.get_dataset extraídos de proyectos de código abierto. Puedes valorar ejemplos para ayudarnos a mejorar la calidad de los ejemplos.

Ejemplo n.º 1

Mostrar archivo

Archivo: command.py Proyecto: skyornig/ekt

def testseq(model, args):
    """Evaluate ``model`` user by user and log running RMSE / MAE / accuracy.

    The dataset named by ``args.dataset`` is loaded and turned into one or
    more test sets according to ``args.split_method`` (``'user'``,
    ``'future'``, ``'old'``, or anything else, in which case an optional
    reference set ``args.ref_set`` is used as history).  For every test set
    at most 5000 shuffled users are evaluated.  At each step the model is
    queried twice: once with the *last* record of the sequence (the clamped
    result is what gets written through ``print_seq``) and once with the
    current record (the result used for the metrics).

    Args:
        model: callable model exposing ``args``, ``words`` and ``cuda()``;
            its class name must start with ``'DK'``, ``'RA'`` or ``'EK'``
            to select the call signature.
        args: option namespace; reads ``dataset``, ``random_level``,
            ``split_method``, ``frac``, ``ref_set``, ``input_knowledge``,
            ``input_text``, ``snapshot``, ``workspace``, ``ref_len``,
            ``loss`` and ``print_every``.

    Returns:
        None.  Results are logged and written to the file returned by
        ``open_result``.
    """
    try:
        torch.set_grad_enabled(False)
    except AttributeError:
        # this torch build has no set_grad_enabled; carry on without it
        pass
    # the class-name prefix selects the model call signature; look it up once
    model_name = type(model).__name__
    logging.info('model: %s, setup: %s' %
                 (model_name, str(model.args)))
    logging.info('loading dataset')

    data = get_dataset(args.dataset)
    data.random_level = args.random_level

    if not args.dataset.endswith('test'):
        if args.split_method == 'user':
            _, data = data.split_user(args.frac)
            testsets = [('user_split', data, {})]
        elif args.split_method == 'future':
            _, data = data.split_future(args.frac)
            testsets = [('future_split', data, {})]
        elif args.split_method == 'old':
            trainset, _, _, _ = data.split()
            data = trainset.get_seq()
            train, user, exam, new = data.split()
            train = train.get_seq()
            user = user.get_seq()
            exam = exam.get_seq()
            new = new.get_seq()
            testsets = zip(['user', 'exam', 'new'], [user, exam, new],
                           [{}, train, user])
        else:
            if args.ref_set:
                ref = get_dataset(args.ref_set)
                ref.random_level = args.random_level
                testsets = [(args.dataset.split('/')[-1], data.get_seq(),
                             ref.get_seq())]
            else:
                testsets = [('student', data.get_seq(), {})]
    else:
        testsets = [('school', data.get_seq(), {})]

    if args.input_knowledge:
        logging.info('loading knowledge concepts')
        topic_dic = {}
        kcat = Categorical(one_hot=True)
        # context managers make sure both files are closed after reading
        with open(model.args['knows']) as knows_file:
            kcat.load_dict(knows_file.read().split('\n'))
        know_path = 'data/id_firstknow.txt' \
            if 'first' in model.args['knows'] else 'data/id_know.txt'
        with open(know_path) as know_file:
            for line in know_file:
                uuid, know = line.strip().split(' ')
                know = know.split(',')
                topic_dic[uuid] = torch.LongTensor(
                    kcat.apply(None, know)).max(0)[0]
        # fallback vector for topics without a knowledge entry
        zero = [0] * len(kcat.apply(None, '<NULL>'))

    if args.input_text:
        logging.info('loading exercise texts')
        topics = get_topics(args.dataset, model.words)

    if args.snapshot is None:
        epoch = load_last_snapshot(model, args.workspace)
    else:
        epoch = args.snapshot
        load_snapshot(model, args.workspace, epoch)
    logging.info('loaded model at epoch %s', str(epoch))

    if use_cuda:
        model.cuda()

    for testset, data, ref_data in testsets:
        logging.info('testing on: %s', testset)
        f = open_result(args.workspace, testset, epoch)

        then = time.time()

        total_mse = 0
        total_mae = 0
        total_acc = 0
        total_seq_cnt = 0

        users = list(data)
        random.shuffle(users)
        seq_cnt = len(users)

        MSE = torch.nn.MSELoss()
        MAE = torch.nn.L1Loss()

        # NOTE(review): only the first 5000 shuffled users are evaluated while
        # seq_cnt counts all of them, so the final "total == seq_cnt" log
        # below cannot fire for larger sets - confirm this is intended.
        for user in users[:5000]:
            total_seq_cnt += 1

            seq = data[user]
            if user in ref_data:
                ref_seq = ref_data[user]
            else:
                ref_seq = []

            length = len(seq)
            ref_len = len(ref_seq)
            # reference history is prepended to the sequence under test
            seq = ref_seq + seq

            # make sure at least args.ref_len leading records are treated as
            # history, taking them from the tested part when needed
            if ref_len < args.ref_len:
                length = length + ref_len - args.ref_len
                ref_len = args.ref_len

            # always keep at least one record to score
            if length < 1:
                ref_len = ref_len + length - 1
                length = 1

            mse = 0
            mae = 0
            acc = 0

            pred_scores = Variable(torch.zeros(len(seq)))

            s = None
            h = None

            for i, item in enumerate(seq):
                # get last record for testing and current record for updating
                if args.input_knowledge:
                    # look the two topics up independently: the last record's
                    # knowledge must not depend on whether the current topic
                    # happens to be known (and must not raise KeyError)
                    knowledge = topic_dic.get(item.topic, zero)
                    knowledge_last = topic_dic.get(seq[-1].topic, zero)
                    knowledge = Variable(torch.LongTensor(knowledge))
                    knowledge_last = Variable(torch.LongTensor(knowledge_last),
                                              volatile=True)

                if args.input_text:
                    text = topics.get(item.topic).content
                    text = Variable(torch.LongTensor(text))
                    text_last = topics.get(seq[-1].topic).content
                    text_last = Variable(torch.LongTensor(text_last),
                                         volatile=True)

                score = Variable(torch.FloatTensor([item.score]),
                                 volatile=True)
                score_last = Variable(torch.FloatTensor([round(seq[-1].score)
                                                         ]),
                                      volatile=True)
                item_time = Variable(torch.FloatTensor([item.time]),
                                     volatile=True)
                time_last = Variable(torch.FloatTensor([seq[-1].time]),
                                     volatile=True)

                # test last score of each seq for seq figure
                if model_name.startswith('DK'):
                    s, _ = model(knowledge_last, score_last, time_last, h)
                elif model_name.startswith('RA'):
                    s, _ = model(text_last, score_last, time_last, h)
                elif model_name.startswith('EK'):
                    s, _ = model(text_last, knowledge_last, score_last,
                                 time_last, h)
                s_last = torch.clamp(s, 0, 1)

                # update student state h until the fit process reaches trainset
                if ref_len > 0 and i > ref_len:
                    # past the history: predict but discard the new state
                    if model_name.startswith('DK'):
                        s, _ = model(knowledge, score, item_time, h)
                    elif model_name.startswith('RA'):
                        s, _ = model(text, score, item_time, h)
                    elif model_name.startswith('EK'):
                        s, _ = model(text, knowledge, score, item_time, h)
                else:
                    if model_name.startswith('DK'):
                        s, h = model(knowledge, score, item_time, h)
                    elif model_name.startswith('RA'):
                        s, h = model(text, score, item_time, h)
                    elif model_name.startswith('EK'):
                        s, h = model(text, knowledge, score, item_time, h)

                pred_scores[i] = s_last

                if args.loss == 'cross_entropy':
                    s = F.sigmoid(s)
                else:
                    s = torch.clamp(s, 0, 1)
                # history records do not contribute to the metrics
                if i < ref_len:
                    continue

                mse += MSE(s, score)
                m = MAE(s, score).data[0]
                mae += m
                acc += m < 0.5

            print_seq(seq, pred_scores.data.cpu().numpy(), ref_len, f, True)

            mse /= length
            mae /= length
            acc = float(acc) / length

            total_mse += mse.data[0]
            total_mae += mae
            total_acc += acc

            # log every args.print_every sequences and on the last one
            if total_seq_cnt % args.print_every != 0 and total_seq_cnt != seq_cnt:
                continue

            now = time.time()
            duration = (now - then) / 60

            logging.info(
                '[%d/%d] (%.2f seqs/min) '
                'rmse %.6f, mae %.6f, acc %.6f' %
                (total_seq_cnt, seq_cnt,
                 ((total_seq_cnt - 1) % args.print_every + 1) / duration,
                 math.sqrt(total_mse / total_seq_cnt),
                 total_mae / total_seq_cnt, total_acc / total_seq_cnt))
            then = now
        f.close()

Ejemplo n.º 2

Mostrar archivo

Archivo: command.py Proyecto: skyornig/ekt

def test(model, args):
    """Evaluate ``model`` on de-duplicated user sequences and log metrics.

    The dataset named by ``args.dataset`` is loaded and split into test
    sets according to ``args.split_method`` (with an optional reference set
    ``args.ref_set`` used as history).  For each test set at most 5000
    shuffled users are evaluated; repeated topics are dropped from both the
    reference history and the tested sequence before scoring.  Running
    RMSE / MAE / accuracy are logged and per-sequence predictions are
    written through ``print_seq``.

    Args:
        model: callable model exposing ``args``, ``words`` and ``cuda()``.
            Models whose class name starts with ``'DK'`` are fed knowledge
            vectors, all others exercise text.
        args: option namespace; reads ``dataset``, ``random_level``,
            ``split_method``, ``frac``, ``ref_set``, ``snapshot``,
            ``workspace``, ``ref_len``, ``test_on_last``, ``test_as_seq``,
            ``test_on_one``, ``loss`` and ``print_every``.

    Returns:
        None.
    """
    try:
        torch.set_grad_enabled(False)
    except AttributeError:
        # this torch build has no set_grad_enabled; carry on without it
        pass
    # the model family decides which input representation is used
    is_dk = type(model).__name__.startswith('DK')
    logging.info('model: %s, setup: %s' %
                 (type(model).__name__, str(model.args)))
    logging.info('loading dataset')
    data = get_dataset(args.dataset)
    data.random_level = args.random_level

    if not args.dataset.endswith('test'):
        if args.split_method == 'user':
            _, data = data.split_user(args.frac)
            testsets = [('user_split', data, {})]
        elif args.split_method == 'future':
            _, data = data.split_future(args.frac)
            testsets = [('future_split', data, {})]
        elif args.split_method == 'old':
            trainset, _, _, _ = data.split()
            data = trainset.get_seq()
            train, user, exam, new = data.split()
            train = train.get_seq()
            user = user.get_seq()
            exam = exam.get_seq()
            new = new.get_seq()
            testsets = zip(['user', 'exam', 'new'], [user, exam, new],
                           [{}, train, user])
        else:
            if args.ref_set:
                ref = get_dataset(args.ref_set)
                ref.random_level = args.random_level
                testsets = [(args.dataset.split('/')[-1], data.get_seq(),
                             ref.get_seq())]
            else:
                testsets = [('student', data.get_seq(), {})]
    else:
        testsets = [('school', data.get_seq(), {})]

    if is_dk:
        topic_dic = {}
        kcat = Categorical(one_hot=True)
        # context managers make sure both files are closed after reading
        with open('data/know_list.txt') as know_list_file:
            kcat.load_dict(know_list_file.read().split('\n'))
        with open('data/id_know.txt') as id_know_file:
            for line in id_know_file:
                uuid, know = line.strip().split(' ')
                know = know.split(',')
                topic_dic[uuid] = \
                    torch.LongTensor(kcat.apply(None, know)) \
                    .max(0)[0] \
                    .type(torch.LongTensor)
        # fallback vector for topics without a knowledge entry
        zero = [0] * len(kcat.apply(None, '<NULL>'))
    else:
        topics = get_topics(args.dataset, model.words)

    if args.snapshot is None:
        epoch = load_last_snapshot(model, args.workspace)
    else:
        epoch = args.snapshot
        load_snapshot(model, args.workspace, epoch)
    logging.info('loaded model at epoch %s', str(epoch))

    if use_cuda:
        model.cuda()

    for testset, data, ref_data in testsets:
        logging.info('testing on: %s', testset)
        f = open_result(args.workspace, testset, epoch)

        then = time.time()

        total_mse = 0
        total_mae = 0
        total_acc = 0
        total_seq_cnt = 0

        users = list(data)
        random.shuffle(users)
        seq_cnt = len(users)

        MSE = torch.nn.MSELoss()
        MAE = torch.nn.L1Loss()

        for user in users[:5000]:
            seq = data[user]
            if user in ref_data:
                ref_seq = ref_data[user]
            else:
                ref_seq = []

            # keep only the first occurrence of every topic, first within the
            # reference history ...
            seq2 = []
            seen = set()
            for item in ref_seq:
                if item.topic in seen:
                    continue
                seen.add(item.topic)
                seq2.append(item)
            ref_seq = seq2

            # ... then within the tested sequence (topics already seen in the
            # history are dropped as well)
            seq2 = []
            for item in seq:
                if item.topic in seen:
                    continue
                seen.add(item.topic)
                seq2.append(item)
            seq = seq2

            ref_len = len(ref_seq)
            seq = ref_seq + seq
            length = len(seq)

            if ref_len < args.ref_len:
                length = length + ref_len - args.ref_len
                ref_len = args.ref_len

            # nothing left to score for this user
            if length < 1:
                continue
            total_seq_cnt += 1

            mse = 0
            mae = 0
            acc = 0

            pred_scores = Variable(torch.zeros(len(seq)))

            s = None
            h = None

            for i, item in enumerate(seq):
                if args.test_on_last:
                    # NOTE(review): `topics` is only assigned for non-DK
                    # models above, so this branch raises NameError for DK
                    # models - confirm test_on_last is never used with them.
                    x = topics.get(seq[-1].topic).content
                    x = Variable(torch.LongTensor(x), volatile=True)
                    score = Variable(torch.FloatTensor([round(seq[-1].score)]),
                                     volatile=True)
                    t = Variable(torch.FloatTensor([seq[-1].time]),
                                 volatile=True)
                    s, _ = model(x, score, t, h)
                    s_last = torch.clamp(s, 0, 1)
                if is_dk:
                    x = topic_dic.get(item.topic, zero)
                else:
                    x = topics.get(item.topic).content
                x = Variable(torch.LongTensor(x))
                score = Variable(torch.FloatTensor([round(item.score)]),
                                 volatile=True)
                t = Variable(torch.FloatTensor([item.time]), volatile=True)
                if args.test_as_seq and i > ref_len and ref_len > 0:
                    # feed the previous prediction back in place of the score
                    s, h = model(x, s.view(1), t, h)
                else:
                    if ref_len > 0 and i > ref_len and not args.test_on_one:
                        # past the history: predict but discard the new state
                        s, _ = model(x, score, t, h)
                    else:
                        s, h = model(x, score, t, h)
                if args.loss == 'cross_entropy':
                    s = F.sigmoid(s)
                else:
                    s = torch.clamp(s, 0, 1)
                if args.test_on_last:
                    pred_scores[i] = s_last
                else:
                    pred_scores[i] = s
                # history records do not contribute to the metrics
                if i < ref_len:
                    continue
                mse += MSE(s, score)
                m = MAE(s, score).data[0]
                mae += m
                acc += m < 0.5

            print_seq(seq,
                      pred_scores.data.cpu().numpy(), ref_len, f,
                      args.test_on_last)

            mse /= length
            mae /= length
            # acc is an integer count: force true division so the ratio is
            # not truncated to 0 under Python 2 (matches testseq)
            acc = float(acc) / length

            total_mse += mse.data[0]
            total_mae += mae
            total_acc += acc

            # log every args.print_every sequences and on the last one
            if total_seq_cnt % args.print_every != 0 and \
                    total_seq_cnt != seq_cnt:
                continue

            now = time.time()
            duration = (now - then) / 60

            logging.info(
                '[%d/%d] (%.2f seqs/min) '
                'rmse %.6f, mae %.6f, acc %.6f' %
                (total_seq_cnt, seq_cnt,
                 ((total_seq_cnt - 1) % args.print_every + 1) / duration,
                 math.sqrt(total_mse / total_seq_cnt),
                 total_mae / total_seq_cnt, total_acc / total_seq_cnt))
            then = now

        f.close()

Ejemplo n.º 3

Mostrar archivo

Archivo: command.py Proyecto: skyornig/ekt

def trainn(model, args):
    """Train ``model`` one user sequence at a time with Adam.

    The dataset named by ``args.dataset`` is loaded, reduced to its
    training part according to ``args.split_method`` and converted to
    per-user sequences.  Knowledge vectors and/or exercise texts are loaded
    depending on ``args.input_knowledge`` / ``args.input_text``.  Training
    resumes from the epoch returned by ``load_last_snapshot``; one
    optimizer step is taken per user sequence, a snapshot is saved every
    ``args.save_every`` sequences and at the end of every epoch.

    Args:
        model: trainable model exposing ``args``, ``words``,
            ``parameters()`` and ``cuda()``; its class name must start with
            ``'DK'``, ``'RA'`` or ``'EK'`` to select the call signature.
        args: option namespace; reads ``dataset``, ``random_level``,
            ``split_method``, ``frac``, ``input_knowledge``,
            ``input_text``, ``workspace``, ``epochs``, ``loss``,
            ``save_every`` and ``print_every``.

    Returns:
        None.
    """
    # the class-name prefix selects the model call signature; look it up once
    model_name = type(model).__name__
    logging.info('model: %s, setup: %s' %
                 (model_name, str(model.args)))
    logging.info('loading dataset')
    data = get_dataset(args.dataset)
    data.random_level = args.random_level

    if args.split_method == 'user':
        data, _ = data.split_user(args.frac)
    elif args.split_method == 'future':
        data, _ = data.split_future(args.frac)
    elif args.split_method == 'old':
        data, _, _, _ = data.split()

    data = data.get_seq()

    if args.input_knowledge:
        logging.info('loading knowledge concepts')
        topic_dic = {}
        kcat = Categorical(one_hot=True)
        # context managers make sure both files are closed after reading
        with open(model.args['knows']) as knows_file:
            kcat.load_dict(knows_file.read().split('\n'))
        know_path = 'data/id_firstknow.txt' \
            if 'first' in model.args['knows'] else 'data/id_know.txt'
        with open(know_path) as know_file:
            for line in know_file:
                uuid, know = line.strip().split(' ')
                know = know.split(',')
                topic_dic[uuid] = torch.LongTensor(
                    kcat.apply(None, know)).max(0)[0]
        # fallback vector for topics without a knowledge entry
        zero = [0] * len(kcat.apply(None, '<NULL>'))

    if args.input_text:
        logging.info('loading exercise texts')
        topics = get_topics(args.dataset, model.words)

    optimizer = torch.optim.Adam(model.parameters())

    start_epoch = load_last_snapshot(model, args.workspace)
    if use_cuda:
        model.cuda()

    for epoch in range(start_epoch, args.epochs):
        logging.info('epoch {}:'.format(epoch))
        then = time.time()

        total_loss = 0
        total_mae = 0
        total_acc = 0
        total_seq_cnt = 0

        users = list(data)
        random.shuffle(users)
        seq_cnt = len(users)

        MSE = torch.nn.MSELoss()
        MAE = torch.nn.L1Loss()

        for user in users:
            total_seq_cnt += 1

            seq = data[user]
            seq_length = len(seq)

            optimizer.zero_grad()

            loss = 0
            mae = 0
            acc = 0

            # the hidden state is reset at the start of every user sequence
            h = None

            for i, item in enumerate(seq):
                if args.input_knowledge:
                    knowledge = topic_dic.get(item.topic, zero)
                    knowledge = Variable(torch.LongTensor(knowledge))

                if args.input_text:
                    text = topics.get(item.topic).content
                    text = Variable(torch.LongTensor(text))
                score = Variable(torch.FloatTensor([item.score]))
                item_time = Variable(torch.FloatTensor([item.time]))

                if model_name.startswith('DK'):
                    s, h = model(knowledge, score, item_time, h)
                elif model_name.startswith('RA'):
                    s, h = model(text, score, item_time, h)
                elif model_name.startswith('EK'):
                    s, h = model(text, knowledge, score, item_time, h)

                s = s[0]

                if args.loss == 'cross_entropy':
                    loss += F.binary_cross_entropy_with_logits(
                        s, score.view_as(s))
                    m = MAE(F.sigmoid(s), score).data[0]
                else:
                    loss += MSE(s, score)
                    m = MAE(s, score).data[0]
                mae += m
                # a prediction counts as correct when it is within 0.5
                acc += m < 0.5

            loss /= seq_length
            mae /= seq_length
            acc = float(acc) / seq_length

            total_loss += loss.data[0]
            total_mae += mae
            total_acc += acc

            loss.backward()
            optimizer.step()

            if total_seq_cnt % args.save_every == 0:
                save_snapshot(model, args.workspace,
                              '%d.%d' % (epoch, total_seq_cnt))

            # log every args.print_every sequences and on the last one
            if total_seq_cnt % args.print_every != 0 and total_seq_cnt != seq_cnt:
                continue

            now = time.time()
            duration = (now - then) / 60

            logging.info(
                '[%d:%d/%d] (%.2f seqs/min) loss %.6f, mae %.6f, acc %.6f' %
                (epoch, total_seq_cnt, seq_cnt,
                 ((total_seq_cnt - 1) % args.print_every + 1) / duration,
                 total_loss / total_seq_cnt, total_mae / total_seq_cnt,
                 total_acc / total_seq_cnt))
            then = now

        save_snapshot(model, args.workspace, epoch + 1)

Ejemplo n.º 4

Mostrar archivo

Archivo: command.py Proyecto: skyornig/ekt

def train(model, args):
    """Train ``model`` one user sequence at a time with Adam.

    The dataset named by ``args.dataset`` is loaded, reduced to its
    training part according to ``args.split_method`` and converted to
    per-user sequences.  Models whose class name starts with ``'DK'`` are
    fed knowledge vectors read from ``data/know_list.txt`` and
    ``data/id_know.txt``; all other models are fed exercise text obtained
    from ``get_topics``.  Scores are rounded before being used as targets.
    Training resumes from the epoch returned by ``load_last_snapshot``; a
    snapshot is saved every ``args.save_every`` sequences and at the end of
    every epoch.

    Args:
        model: trainable model exposing ``args``, ``words``,
            ``parameters()`` and ``cuda()``.
        args: option namespace; reads ``dataset``, ``random_level``,
            ``split_method``, ``frac``, ``workspace``, ``epochs``,
            ``loss``, ``save_every`` and ``print_every``.

    Returns:
        None.
    """
    # the model family decides which input representation is used
    is_dk = type(model).__name__.startswith('DK')
    logging.info('args: %s' % str(args))
    logging.info('model: %s, setup: %s' %
                 (type(model).__name__, str(model.args)))
    logging.info('loading dataset')
    data = get_dataset(args.dataset)
    data.random_level = args.random_level

    if args.split_method == 'user':
        data, _ = data.split_user(args.frac)
    elif args.split_method == 'future':
        data, _ = data.split_future(args.frac)
    elif args.split_method == 'old':
        data, _, _, _ = data.split()

    data = data.get_seq()

    if is_dk:
        topic_dic = {}
        kcat = Categorical(one_hot=True)
        # context managers make sure both files are closed after reading
        with open('data/know_list.txt') as know_list_file:
            kcat.load_dict(know_list_file.read().split('\n'))
        with open('data/id_know.txt') as id_know_file:
            for line in id_know_file:
                uuid, know = line.strip().split(' ')
                know = know.split(',')
                topic_dic[uuid] = \
                    torch.LongTensor(kcat.apply(None, know)) \
                    .max(0)[0] \
                    .type(torch.LongTensor)
        # fallback vector for topics without a knowledge entry
        zero = [0] * len(kcat.apply(None, '<NULL>'))
    else:
        topics = get_topics(args.dataset, model.words)

    optimizer = torch.optim.Adam(model.parameters())

    start_epoch = load_last_snapshot(model, args.workspace)
    if use_cuda:
        model.cuda()

    for epoch in range(start_epoch, args.epochs):
        logging.info(('epoch {}:'.format(epoch)))
        then = time.time()

        total_loss = 0
        total_mae = 0
        total_acc = 0
        total_seq_cnt = 0

        users = list(data)
        random.shuffle(users)
        seq_cnt = len(users)

        MSE = torch.nn.MSELoss()
        MAE = torch.nn.L1Loss()

        for user in users:
            total_seq_cnt += 1

            seq = data[user]
            length = len(seq)

            optimizer.zero_grad()

            loss = 0
            mae = 0
            acc = 0

            # the hidden state is reset at the start of every user sequence
            h = None

            for i, item in enumerate(seq):
                if is_dk:
                    x = topic_dic.get(item.topic, zero)
                else:
                    x = topics.get(item.topic).content
                x = Variable(torch.LongTensor(x))
                score = Variable(torch.FloatTensor([round(item.score)]))
                t = Variable(torch.FloatTensor([item.time]))
                s, h = model(x, score, t, h)
                if args.loss == 'cross_entropy':
                    loss += F.binary_cross_entropy_with_logits(
                        s, score.view_as(s))
                    m = MAE(F.sigmoid(s), score).data[0]
                else:
                    loss += MSE(s, score)
                    m = MAE(s, score).data[0]
                mae += m
                # a prediction counts as correct when it is within 0.5
                acc += m < 0.5

            loss /= length
            mae /= length
            # acc is an integer count: force true division so the ratio is
            # not truncated to 0 under Python 2 (matches trainn)
            acc = float(acc) / length

            total_loss += loss.data[0]
            total_mae += mae
            total_acc += acc

            loss.backward()
            optimizer.step()

            if total_seq_cnt % args.save_every == 0:
                save_snapshot(model, args.workspace,
                              '%d.%d' % (epoch, total_seq_cnt))

            # log every args.print_every sequences and on the last one
            if total_seq_cnt % args.print_every != 0 and \
                    total_seq_cnt != seq_cnt:
                continue

            now = time.time()
            duration = (now - then) / 60

            logging.info(
                '[%d:%d/%d] (%.2f seqs/min) '
                'loss %.6f, mae %.6f, acc %.6f' %
                (epoch, total_seq_cnt, seq_cnt,
                 ((total_seq_cnt - 1) % args.print_every + 1) / duration,
                 total_loss / total_seq_cnt, total_mae / total_seq_cnt,
                 total_acc / total_seq_cnt))
            then = now

        save_snapshot(model, args.workspace, epoch + 1)

Ejemplo n.º 5

Mostrar archivo

from dataprep import Dataset, get_dataset, Record
from random import sample

# Build per-fraction test sets from the full dataset: for every user of a
# "future" training split, pick topics the user answered in the full data
# but not in their own training sequence.
full = get_dataset('full')
full.random_level = 0
seq = full.get_seq()

for frac in [60, 70, 80, 90]:
    train = get_dataset('data/raw50/future.train.%d' % frac)
    train_topics = set(train.topics)
    test = Dataset()
    cold_test = Dataset()
    train_seq = train.get_seq()
    for user in train_seq:
        # number of test records matching a frac : (100 - frac) split
        L = int(len(train_seq[user]) / frac * (100 - frac))
        topics = set(x.topic for x in seq[user])
        # topics of this user that appear in the training split, but not in
        # this user's own training sequence ('-' binds tighter than '&';
        # the parentheses only make that explicit)
        same = topics & (train_topics -
                         set(x.topic for x in train_seq[user]))
        # topics of this user that never appear in the training split
        diff = topics - train_topics
        pop = same | diff
        print(len(same), len(diff))
        # aim for at least 5 sampled topics, but never ask for more than
        # are available: random.sample raises ValueError otherwise
        L = min(max(5, L - len(diff)), len(same))
        selected = set(sample(list(same), L)) | diff

        for topic, score, time, _ in seq[user]:
            r = Record(user, full.user_school_map[user],
                       topic, full.topic_exam_map[topic],
                       score, time)
            if topic in diff:
                cold_test._insert(r)
            if topic in selected:
                test._insert(r)