Example #1
def get_cado_predictions():
    data_path = '../../datasets/cado/train.csv'
    test_path = '../../datasets/cado/test.csv'

    data = du.load_data(data_path)
    test = du.load_data(test_path)

    text_index = 6
    label_start_index = 7
    X = [d[text_index] for d in data]
    labels = [d[label_start_index:label_start_index + 12] for d in data]

    X_test = [d[text_index] for d in test]
    labels_test = [d[label_start_index:label_start_index + 12] for d in test]

    Y = np.array(labels, dtype='int')
    y_test = np.array(labels_test, dtype='int')
    #Y = np.array(binary_labels, dtype='int')

    # remember where the training portion ends, then tokenize train and test together
    test_index = len(X)

    X = X + X_test
    Y = np.vstack([Y, y_test])

    tokenizer = tokenize_data(X)
    word_index = tokenizer.word_index

    sequences = tokenizer.texts_to_sequences(X)

    X = pad_sequences(sequences,
                      maxlen=700,
                      padding="post",
                      truncating="post",
                      value=0)

    num_words = min(MAX_NB_WORDS, len(word_index) + 1)
    # dummy one-column "embedding": every in-vocabulary word is mapped to 1
    embedding_matrix = np.zeros((num_words, 1))

    for word, i in word_index.items():
        if i >= MAX_NB_WORDS:
            continue
        embedding_matrix[i] = 1

    # split the padded sequences back into the original train/test partitions
    X_train = X[0:test_index, :]
    Y_train = Y[0:test_index, :]
    x_test = X[test_index:len(X), :]
    y_test = Y[test_index:len(Y), :]

    classifier = MLkNN()
    classifier.fit(X_train, Y_train)
    predictions = classifier.predict(x_test)
    scores = classifier.predict_proba(x_test)
    y_pred = predictions.toarray()
    y_score = scores.toarray()

    return y_pred, y_score
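A minimal usage sketch (assuming the module-level imports and helpers used above — du, np, MAX_NB_WORDS, tokenize_data, pad_sequences, MLkNN — are already in scope):

y_pred, y_score = get_cado_predictions()
print(y_pred.shape)         # (n_test_documents, 12) binary label matrix
print(y_score.shape)        # matching per-label probability estimates
print(y_pred.sum(axis=0))   # number of predicted positives per label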
Example #2
def _dkt_test_models_multistep_chunk(trainparams, mctsparams, runstartix,
                                     chunk_num_runs):
    '''
    Evaluate multistep error for the chunk of models.
    '''

    ms_losses = [[] for _ in six.moves.range(chunk_num_runs)]

    #load data
    data = dataset_utils.load_data(
        filename='{}{}'.format(dg.SYN_DATA_DIR, mctsparams.mserror_file))

    for offset in six.moves.range(chunk_num_runs):
        r = runstartix + offset
        for ep in trainparams.saved_epochs:
            print('=====================================')
            print('---------- Rep {:2d} Epoch {:2d} ----------'.format(r, ep))
            print('=====================================')

            # load model from checkpoint
            checkpoint_name = trainparams.checkpoint_pat.format(
                trainparams.run_name, r, ep)
            checkpoint_path = '{}/{}'.format(trainparams.dir_name,
                                             checkpoint_name)

            # compute the multistep on the training data
            curr_loss = test_dkt_multistep(trainparams.model_id,
                                           data,
                                           chkpt=checkpoint_path)
            ms_losses[offset].append(curr_loss)

            six.print_(curr_loss)

    return ms_losses
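A small follow-up sketch for aggregating the returned losses with numpy (hypothetical call: trainparams and mctsparams are assumed to be built elsewhere, and every run is assumed to use the same saved_epochs):

import numpy as np

ms_losses = _dkt_test_models_multistep_chunk(trainparams, mctsparams,
                                             runstartix=0, chunk_num_runs=4)
ms = np.array(ms_losses)   # shape: (chunk_num_runs, len(saved_epochs))
print(ms.mean(axis=0))     # mean multistep loss per saved epoch, across the chunk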
Example #3
def save_to_latex():
    n = 12
    results = []
    results_path = '../../results/multi-cado/labels_r/'

    classifier_names = ['LSTM1', 'LSTM2', 'MLkNN', 'MF1', 'MF2', 'MK3', 'RAND']
    
    
    for l in range(n):
        tmp = []
        for clf in classifier_names:
            data = du.load_data(results_path+clf)
            l_r = data[l]
            tmp.append([clf]+l_r[0:3])
            
        results.append(tmp)
        
    #fmt = "%d, %d, %d, %s"
    #all_results = np.round(all_results, decimals=3)
    # all_results.astype('str')
    tex_out = "" 
    for k in range(n):
        #np.savetxt(, np.array(all_results[k]))
        du.save_data(results[k], '../../results_out/prf'+str(k)+'.csv', header=['classifier', 'precision', 'recall', 'f1-score'])
        if k%2 == 0:
            tex_out += r'''
\begin{table}[!htb]
            '''
        # all literal chunks use raw strings so the LaTeX backslashes survive unescaped
        tex_out += r'''
    \begin{minipage}{.5\textwidth}
        \centering
        \caption{Caption ''' + str(k) + r'''}
        \label{tab:prf_''' + str(k) + r'''}
        \pgfplotstabletypeset[col sep=comma,
        header=true,
        precision=4,
        columns/classifier/.style={string type, column type=r, column name=\ },
        columns={classifier, precision, recall, f1-score},
        highlight col max ={prf''' + str(k) + r'''.csv}{precision},
        highlight col max ={prf''' + str(k) + r'''.csv}{recall},
        highlight col max ={prf''' + str(k) + r'''.csv}{f1-score},
        every head row/.style={before row=\\\toprule, after row=\bottomrule},
        every even row/.style={before row={\rowcolor[gray]{0.92}}},
        every last row/.style={after row=\bottomrule}
        ]{prf''' + str(k) + r'''.csv}
    \end{minipage}'''
        if (k % 2 != 0 and k > 0) or k == n - 1:
            tex_out += r'''
\end{table}
            '''

    
    with open("../../results_out/tables.tex", "w") as text_file:
        text_file.write(tex_out)
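The generated tables.tex is meant to be \input into a document whose preamble loads pgfplotstable (for \pgfplotstabletypeset), booktabs (for \toprule and \bottomrule) and a colour-capable table setup such as xcolor's table option (for \rowcolor); `highlight col max` is not a stock pgfplotstable key, so it presumably refers to a style defined elsewhere in the authors' preamble.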
Example #4
    auc_results = []
    fr = []
    cfms = {}
    
    java_aucs = []
    net_aucs = []

    for clf_name in classifier_names:
        precision_results[clf_name] = [] 
        recall_results[clf_name] = [] 
        fscore_results[clf_name] = [] 
        amount_results[clf_name] = [] 
        
        cfms[clf_name] = []
    
    d = du.load_data(results_path+'y_test')   
    pr_ids = du.load_data(results_path+'test_pr_ids.csv')   
    t = np.array(d)
    t = t.astype('float')
    
    pr_ids = np.array([int(i[0]) for i in pr_ids])
    
    # turn the project ids into boolean masks (ids 8 and 7 select the two project groups)
    java = np.array(pr_ids)
    net = np.array(pr_ids)
    java[java == 8] = 1
    java[java == 7] = 0
    java = java == 1

    net[net == 8] = 0
    net[net == 7] = 1
    net = net == 1
Example #5
def main_worker(gpu, args, config, hyper):
    torch.backends.cudnn.benchmark = True
    torch.backends.cudnn.enabled = True

    best_acc1 = 0
    args.writer = None
    start_epoch = 0
    distributed = args.gpu is None

    if distributed:
        dist.init_process_group(backend=args.dist_backend,
                                init_method="tcp://10.0.1.164:12345",
                                world_size=args.world_size,
                                rank=gpu)
        print("Process: {}, rank: {}, world_size: {}".format(
            gpu, dist.get_rank(), dist.get_world_size()))

    # Set the default CUDA device; tensors created with .cuda() will land on it by default
    torch.cuda.set_device(gpu)

    train_loader = load_data(config.train_path, args, hyper, distributed)
    val_loader = load_val(config.val_path, args, hyper, distributed)
    assert train_loader.dataset.classes == val_loader.dataset.classes

    model = resnet18()
    model.cuda(gpu)

    criterion = nn.CrossEntropyLoss().cuda(gpu)
    optimizer = optim.SGD(model.parameters(),
                          lr=hyper.base_lr,
                          momentum=hyper.momentum,
                          weight_decay=hyper.weight_decay)

    # Nvidia documentation states -
    # "O2 exists mainly to support some internal use cases. Please prefer O1"
    # https://github.com/NVIDIA/apex/tree/master/examples/imagenet
    model, optimizer = amp.initialize(model, optimizer, opt_level="O1")

    if distributed:
        # By default, apex.parallel.DistributedDataParallel overlaps communication
        # with computation in the backward pass.
        # delay_allreduce delays all communication to the end of the backward pass.
        model = apex.parallel.DistributedDataParallel(model)

    if args.resume:
        checkpoint = torch.load(config.checkpoint_file, map_location='cpu')
        best_acc1 = checkpoint['best_acc1']
        model.load_state_dict(checkpoint["model"])
        optimizer.load_state_dict(checkpoint["optimizer"])
        amp.load_state_dict(checkpoint["amp"])
        start_epoch = checkpoint["epoch"]
        del checkpoint
    start_epoch = args.start_epoch - 1 if "start_epoch_overr" in args.__dict__ else start_epoch

    if args.evaluate:
        train_or_eval(False, gpu, val_loader, model, criterion, None, args,
                      hyper, 0)
        return

    if not distributed or gpu == 0:
        args.writer = SummaryWriter(filename_suffix="{}".format(gpu))

    end_epoch = start_epoch + args.epochs
    for epoch in range(start_epoch, end_epoch):
        if distributed:
            train_loader.sampler.set_epoch(epoch)

        train_or_eval(True, gpu, train_loader, model, criterion, optimizer,
                      args, hyper, epoch)

        if not args.prof and (not distributed or gpu == 0):
            acc1 = train_or_eval(False, gpu, val_loader, model, criterion,
                                 None, args, hyper, 0)

            is_best = acc1 > best_acc1
            best_acc1 = max(acc1, best_acc1)

            print("Saving model state...\n")
            save_checkpoint(
                {
                    "epoch": epoch + 1,
                    "base_lr": hyper.base_lr,
                    "max_lr": hyper.max_lr,
                    "stepsize": hyper.stepsize,
                    "lr_policy": hyper.lr_policy,
                    "batch_size": hyper.batch_size * args.world_size,
                    "model": model.state_dict(),
                    "optimizer": optimizer.state_dict(),
                    "amp": amp.state_dict(),
                    "best_acc1": best_acc1,
                },
                is_best,
                filename=config.checkpoint_write)
    if args.writer:
        args.writer.close()
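A minimal launcher sketch for the worker above, assuming one process per GPU and that args, config and hyper are constructed elsewhere (names as in the signature):

import torch.multiprocessing as mp

# spawn args.world_size processes; each one receives its process index as `gpu`
mp.spawn(main_worker, args=(args, config, hyper), nprocs=args.world_size)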
Example #6
def main():
    n_concepts = 4
    use_student2 = True
    student2_str = '2' if use_student2 else ''
    learn_prob = 0.5
    lp_str = '-lp{}'.format(int(learn_prob * 100)) if not use_student2 else ''
    n_students = 100000
    seqlen = 7
    filter_mastery = True
    filter_str = '' if not filter_mastery else '-filtered'
    policy = 'random'
    filename = 'test{}-n{}-l{}{}-{}{}.pickle'.format(student2_str, n_students,
                                                     seqlen, lp_str, policy,
                                                     filter_str)
    #concept_tree = sm.create_custom_dependency()
    concept_tree = ConceptDependencyGraph()
    concept_tree.init_default_tree(n_concepts)
    if not use_student2:
        test_student = Student(n=n_concepts,
                               p_trans_satisfied=learn_prob,
                               p_trans_not_satisfied=0.0,
                               p_get_ex_correct_if_concepts_learned=1.0)
    else:
        test_student = Student2(n_concepts)
    print(filename)

    # load toy data
    data = dataset_utils.load_data(
        filename='{}{}'.format(dg.SYN_DATA_DIR, filename))
    print('Average posttest: {}'.format(sm.expected_reward(data)))
    print('Percent of full posttest score: {}'.format(
        sm.percent_complete(data)))
    print('Percent of all seen: {}'.format(sm.percent_all_seen(data)))
    input_data_, output_mask_, target_data_ = dataset_utils.preprocess_data_for_rnn(
        data)

    train_data = (input_data_[:, :, :], output_mask_[:, :, :],
                  target_data_[:, :, :])
    print(input_data_.shape)
    print(output_mask_.shape)
    print(target_data_.shape)

    # test_model hidden=16
    # test_model_mid hidden=10
    # test_model_small hidden=5
    # test_model_tiny hidden=3
    model_id = "test2_model_small"
    dropouts = np.array([1.0])
    n_dropouts = dropouts.shape[0]
    total_epochs = 14
    reps = 20

    class ExtractCallback(tflearn.callbacks.Callback):
        def __init__(self):
            self.tstates = []

        def on_epoch_end(self, training_state):
            self.tstates.append(copy.copy(training_state))

    def test_dropout_losses():
        losses = np.zeros((n_dropouts, reps, total_epochs))
        val_losses = np.zeros((n_dropouts, reps, total_epochs))

        for d in range(n_dropouts):
            dropout = dropouts[d]
            for r in range(reps):
                print('----------------------------------------')
                print('---------- Dropout {:3.1f} Rep {:2d} ----------'.format(
                    dropout, r + 1))
                print('----------------------------------------')
                ecall = ExtractCallback()
                dmodel = dmc.DynamicsModel(model_id=model_id,
                                           timesteps=seqlen,
                                           dropout=dropout,
                                           load_checkpoint=False)
                dmodel.train(train_data,
                             n_epoch=total_epochs,
                             callbacks=ecall,
                             shuffle=False,
                             load_checkpoint=False)
                losses[d, r, :] = np.array(
                    [s.global_loss for s in ecall.tstates])
                val_losses[d, r, :] = np.array(
                    [s.val_loss for s in ecall.tstates])

        return losses, val_losses

    losses, val_losses = test_dropout_losses()

    np.savez("dropoutput", dropouts=dropouts, losses=losses, vals=val_losses)
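A small sketch for reading the saved results back (np.savez appends the .npz extension):

import numpy as np

run = np.load("dropoutput.npz")
print(run["dropouts"])              # dropout settings that were swept
print(run["losses"].mean(axis=1))   # training loss per epoch, averaged over reps
print(run["vals"].mean(axis=1))     # validation loss per epoch, averaged over reps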
Example #7
def _dkt_train_models_chunk(params, runstartix, chunk_num_runs):
    '''
    Loads data and trains a batch of models.
    A batch is a contiguous sequence of runs.
    '''

    #six.print_('startix {} nruns {}'.format(runstartix,chunk_num_runs))

    train_losses = [[] for _ in six.moves.range(chunk_num_runs)]
    val_losses = [[] for _ in six.moves.range(chunk_num_runs)]

    #load data
    data = dataset_utils.load_data(
        filename='{}{}'.format(dg.SYN_DATA_DIR, params.datafile))
    input_data_, output_mask_, target_data_ = dataset_utils.preprocess_data_for_rnn(
        data)

    for offset in six.moves.range(chunk_num_runs):
        r = runstartix + offset

        # new model instantiation
        dkt_model = dmc.DynamicsModel(model_id=params.model_id,
                                      timesteps=params.seqlen - 1,
                                      dropout=params.dropout,
                                      output_dropout=params.output_dropout,
                                      load_checkpoint=False)

        epochs_trained = 0
        for ep in params.saved_epochs:
            print('=====================================')
            print('---------- Rep {:2d} Epoch {:2d} ----------'.format(r, ep))
            print('=====================================')

            # remember the epochs are given as zero-based
            epochs_to_train = ep + 1 - epochs_trained
            assert epochs_to_train > 0

            # train
            ecall = ExtractCallback()

            for _ in six.moves.range(epochs_to_train):
                # re-draw the input noise each epoch so every pass sees a different perturbation
                processed_input_data = input_data_ + (
                    params.noise * np.random.randn(*input_data_.shape))
                train_data = (processed_input_data[:, :, :],
                              output_mask_[:, :, :], target_data_[:, :, :])
                dkt_model.train(train_data,
                                n_epoch=1,
                                callbacks=ecall,
                                shuffle=params.shuffle,
                                load_checkpoint=False)

            # save the checkpoint
            checkpoint_name = params.checkpoint_pat.format(
                params.run_name, r, ep)
            checkpoint_path = '{}/{}'.format(params.dir_name, checkpoint_name)
            dkt_model.save(checkpoint_path)

            # update stats
            train_losses[offset].extend([
                np.mean([ts.global_loss for ts in batch])
                for batch in ecall.tstates
            ])
            val_losses[offset].extend(
                [batch[-1].val_loss for batch in ecall.tstates])

            # update epochs_trained
            epochs_trained = ep + 1
    return (train_losses, val_losses)
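A hedged aggregation sketch for the returned curves (hypothetical call: a params object like the one the function expects is assumed, and every run in the chunk is assumed to have trained for the same number of epochs):

import numpy as np

train_losses, val_losses = _dkt_train_models_chunk(params, runstartix=0,
                                                   chunk_num_runs=4)
mean_train = np.mean(np.array(train_losses), axis=0)  # per-epoch training loss across runs
mean_val = np.mean(np.array(val_losses), axis=0)      # per-epoch validation loss across runs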
Example #8
def load_toy_data():
    filename = "toy.pickle"
    data = dataset_utils.load_data(
        filename="{}{}".format(SYN_DATA_DIR, filename))
    print("Loaded data. # samples:  {}".format(len(data)))
Example #9
    if len(sys.argv) > 4:
        test_path = sys.argv[4]

    if len(sys.argv) > 5:
        result_path = sys.argv[5]

    if len(sys.argv) > 6:
        # bool(non-empty string) is always True, so parse the flag explicitly
        train_embeddings = sys.argv[6].lower() in ('true', '1')

    head = [[0, "documentText"], [1, "functionality"], [2, "concept"],
            [3, "directives"], [4, "purpose"], [5, "quality"], [6, "control"],
            [7, "structure"], [8, "patterns"], [9, "codeExamples"],
            [10, "environment"], [11, "reference"], [12, "nonInformation"]]

    data = du.load_data(data_path)
    test = du.load_data(test_path)

    #prid_index = 1
    #text_index = 0
    #label_start_index = 1

    prid_index = 3
    text_index = 6
    label_start_index = 7
    X = [d[text_index] for d in data]
    labels = [d[label_start_index:label_start_index + 12] for d in data]
    pr_ids = np.array([d[prid_index] for d in data])

    prid_index = 3
    text_index = 6