Example #1
def test_mini_batch():
    # Small input
    output = torch.FloatTensor([[0.1, 0.0], [0.1, 0.0]])
    target = torch.LongTensor([0, 0])
    assert accuracy(output, target)[0].cpu().numpy() == 100.0

    # A bit larger input
    output = torch.FloatTensor([[0.1, 0.2, 0.3, 0.4, 0.5, 0.6],
                                [0.1, 0.7, 0.3, 0.4, 0.5, 0.6],
                                [0.1, 0.2, 0.8, 0.4, 0.5, 0.6],
                                [0.1, 0.2, 0.3, 0.9, 0.5, 0.6],
                                [0.1, 0.2, 0.3, 0.4, 1.0, 0.6],
                                [0.1, 0.2, 0.3, 0.4, 0.5, 0.6]])
    target = torch.LongTensor([5, 1, 2, 3, 4, 5])
    assert accuracy(output, target)[0].cpu().numpy() == 100.0

    # A bit larger input - with not 100%
    output = torch.FloatTensor([[0.1, 0.2, 0.3, 0.4, 0.5, 0.6],
                                [0.1, 0.7, 0.3, 0.4, 0.5, 0.6],
                                [0.1, 0.2, 0.8, 0.4, 0.5, 0.6],
                                [0.1, 0.2, 0.3, 0.9, 0.5, 0.6],
                                [0.1, 0.2, 0.3, 0.4, 1.0, 0.6],
                                [0.1, 0.2, 0.3, 0.4, 0.5, 0.6]])
    target = torch.LongTensor([1, 1, 1, 1, 1, 1])
    np.testing.assert_almost_equal(
        accuracy(output, target)[0].cpu().numpy(), 100 / 6.0)
Example #2
def test_no_batch():
    # Sanity check
    output = torch.FloatTensor([0.0, 0.0]).unsqueeze(0)
    target = torch.LongTensor([0])
    assert accuracy(output, target)[0].cpu().numpy() == 0.0

    output = torch.FloatTensor([0.0, 1.0]).unsqueeze(0)
    target = torch.LongTensor([1])
    assert accuracy(output, target)[0].cpu().numpy() == 100.0

    output = torch.FloatTensor([0.2, 0.5, 0.7]).unsqueeze(0)
    target = torch.LongTensor([2])
    assert accuracy(output, target)[0].cpu().numpy() == 100.0
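The tests in Examples #1 and #2 pin down the contract: accuracy(output, target) returns a list of percentage tensors, one per requested k. Below is a minimal sketch consistent with that contract, in the style of the widely copied PyTorch ImageNet helper; the tested repo's own implementation is not shown, and the first assertion in test_no_batch additionally depends on how ties between equal scores are broken, which this sketch does not guarantee.

import torch

def accuracy(output, target, topk=(1,)):
    # Precision@k for each k in topk, returned as percentage tensors.
    maxk = max(topk)
    batch_size = target.size(0)

    _, pred = output.topk(maxk, 1, True, True)  # (batch, maxk) top indices
    pred = pred.t()                             # (maxk, batch)
    correct = pred.eq(target.view(1, -1).expand_as(pred))

    res = []
    for k in topk:
        correct_k = correct[:k].reshape(-1).float().sum(0)
        res.append(correct_k.mul_(100.0 / batch_size))
    return res

The same calling convention appears again in the training and test loops of Examples #7 and #21, which request topk=(1, 5).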
Example #3
    def validate_sentences(self, data, sent_sampler, model, transformation):
        model.eval()
        x, mask, y = data
        # x, mask, y = sent_sampler.get_test()
        true_y = np.zeros(shape=(len(y), len(sent_sampler.unique_labels)),
                          dtype=np.int32)
        for idx, current_y in enumerate(y):
            true_y[idx, current_y] = 1

        x, mask, y = model.prepare_data_for_classifier(x, mask, y,
                                                       transformation)

        if model.is_cuda:
            x = x.cuda()
            y = y.cuda()
            mask = mask.cuda()

        loss = model.classifier.get_loss(x, mask, y).data.cpu().numpy()
        probs = model.classifier(x, mask)[1].data.cpu().numpy()

        pred = np.argmax(probs, axis=1)

        acc = evaluation.accuracy(predicted_probs=probs, true_y=true_y)
        prec = {}
        rec = {}
        for cls in range(true_y.shape[1]):
            prec[cls] = evaluation.precision_by_class(probs, true_y, cls)
            rec[cls] = evaluation.recall_by_class(probs, true_y, cls)

        return acc, prec, rec, loss, evaluation.build_confusion_matrix(
            probs, true_y)
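In validate_sentences the metric receives predicted probabilities and the one-hot true_y built just above. A minimal NumPy sketch with that calling convention (an assumption; the evaluation module itself is not shown):

import numpy as np

def accuracy(predicted_probs, true_y):
    # Both arguments are (n_samples, n_classes); compare argmax positions.
    return float(np.mean(np.argmax(predicted_probs, axis=1)
                         == np.argmax(true_y, axis=1)))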
Example #4
def main():
    features = [
        'EMA10',
        'EMA12',
        'EMA20',
        'EMA26',
        'EMA50',
        'EMA100',
        'EMA200',
        'SMA5',
        'SMA10',
        'SMA15',
        'SMA20',
        'SMA50',
        'SMA100',
        'SMA200',
    ]
    label = ['Class']

    df_train, df_test = Load_data()
    Xtrain = df_train[features].values
    Ytrain = df_train[label].values.ravel()
    Xtest = df_test[features].values
    Ytest = df_test[label].values.ravel()

    svm_poly = SVMModel(2)
    svm_poly.train(Xtrain, Ytrain)
    Ypredicted = np.array(svm_poly.predict(Xtest))
    print(eval.accuracy(prediction=Ypredicted, true_class=Ytest))
    print(Ypredicted)
Example #5
def test(test_model, test_data, test_labels, show_mistake=False):
    test_predictions = test_model.predict(test_data, verbose=0)

    # PRINT WRONG PREDICTIONS
    if show_mistake:
        for i in range(len(test_predictions)):
            stress_probability = test_predictions[i][1]
            score = abs(test_labels[i][1] - stress_probability)
            if score > 0:
                seq = ""
                for j in range(len(test_data[i])):
                    seq += idx_to_word[test_data[i][j]].strip() + " "
                print(seq, ",", score, ",", test_labels[i][1],
                      stress_probability)

    # TEST PERFORMANCE
    res_accu = eval.accuracy(test_predictions, test_labels)
    res_f1 = eval.fscore(test_predictions, test_labels)
    res_recall = eval.recall(test_predictions, test_labels)
    res_precision = eval.precision(test_predictions, test_labels)
    print('Test Accuracy: %.3f' % res_accu)
    print('Test F1-score: %.3f' % res_f1)
    print('Test Recall: %.3f' % res_recall)
    print('Test Precision: %.3f' % res_precision)

    return res_accu, res_f1, res_recall, res_precision
Example #6
def online_evaluate(gtmat, pred):
    pred_labels = torch.argmax(pred.cpu(), dim=1).long()
    gt_labels = gtmat.view(-1).cpu().numpy()
    pred_labels = pred_labels.numpy()
    acc = accuracy(gt_labels, pred_labels)
    pre = precision(gt_labels, pred_labels)
    rec = recall(gt_labels, pred_labels)
    return acc, pre, rec
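online_evaluate passes flat label arrays, ground truth first. Under that convention accuracy is just the match rate; a sketch (the precision and recall helpers would need an analogous, assumed signature):

import numpy as np

def accuracy(gt_labels, pred_labels):
    # Fraction of positions where the predicted label matches ground truth.
    return float(np.mean(np.asarray(gt_labels) == np.asarray(pred_labels)))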
Example #7
def train(train_loader, model, criterion, optimizer, epoch, print_freq,
          summary_writer):
    batch_time = AverageMeter()
    data_time = AverageMeter()
    losses = AverageMeter()
    top1 = AverageMeter()
    top5 = AverageMeter()

    # switch to train mode
    model.train()

    end = time.time()
    for i, (input, target) in enumerate(train_loader):
        # measure data loading time
        data_time.update(time.time() - end)

        target = target.cuda(non_blocking=True)  # 'async' is a reserved word in Python 3.7+
        input_var = torch.autograd.Variable(input)
        target_var = torch.autograd.Variable(target)

        # compute output
        output = model(input_var)
        loss = criterion(output, target_var)

        # measure accuracy and record loss
        prec1, prec5 = accuracy(output.data, target, topk=(1, 5))
        losses.update(loss.item(), input.size(0))  # loss.data[0] indexing was removed in PyTorch 0.4
        top1.update(prec1[0], input.size(0))
        top5.update(prec5[0], input.size(0))

        # compute gradient and do SGD step
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        # measure elapsed time
        batch_time.update(time.time() - end)
        end = time.time()

        if i % print_freq == 0:
            print('Epoch: [{0}][{1}/{2}]\t'
                  'Time {batch_time.val:.3f} ({batch_time.avg:.3f})\t'
                  'Data {data_time.val:.3f} ({data_time.avg:.3f})\t'
                  'Loss {loss.val:.4f} ({loss.avg:.4f})\t'
                  'Prec@1 {top1.val:.3f} ({top1.avg:.3f})\t'
                  'Prec@5 {top5.val:.3f} ({top5.avg:.3f})'.format(
                      epoch,
                      i,
                      len(train_loader),
                      batch_time=batch_time,
                      data_time=data_time,
                      loss=losses,
                      top1=top1,
                      top5=top5))

    summary_writer.add_scalar('data/losses_avg', losses.avg, epoch)
    summary_writer.add_scalar('data/top1_avg', top1.avg, epoch)
    summary_writer.add_scalar('data/top5_avg', top5.avg, epoch)
Example #8
def ELM(numberofHiddenNeurons,
        train,
        test,
        ActivationFunction,
        baseclasser=False):

    if not baseclasser:
        trainStr = ELMDataStruct(train)
        testStr = ELMDataStruct(test)
    else:
        trainStr = train
        testStr = test
    #print(trainStr.labelsMatrix)
    #print(testStr.labelsMatrix)

    beginTrainTime = time()
    inputWeight = random.random(size=(numberofHiddenNeurons,
                                      trainStr.numOfFeature)) * 2 - 1
    biasOfHiddenNeurons = random.random(size=(numberofHiddenNeurons, 1))
    tempH = inputWeight * trainStr.X.T
    biasMatrix = tile(biasOfHiddenNeurons, (1, trainStr.numOfData))
    tempH = tempH + biasMatrix
    # Up to this point, tempH has shape (number of hidden neurons, number of samples)
    print('ActivationFunction:', ActivationFunction)
    if ActivationFunction == 'rbf':
        H = getRBF(trainStr.X, inputWeight, biasOfHiddenNeurons,
                   numberofHiddenNeurons)
    else:
        H = getH(tempH, ActivationFunction)
    outputWeight = (linalg.pinv(H.T) * trainStr.labelsMatrix.T)
    # outputWeight has shape (numberofHiddenNeurons, numOfClass)
    endTrainTime = time()
    trainTime = endTrainTime - beginTrainTime

    tempTest = inputWeight * testStr.X.T
    biasMatrixTest = tile(biasOfHiddenNeurons, (1, testStr.numOfData))
    tempTest = tempTest + biasMatrixTest
    if ActivationFunction == 'rbf':
        H_test = getRBF(testStr.X, inputWeight, biasOfHiddenNeurons,
                        numberofHiddenNeurons)
    else:
        H_test = getH(tempTest, ActivationFunction)
    Y = (H_test.T * outputWeight).A
    answer = ones((testStr.numOfData, 1))
    for k in range(testStr.numOfData):
        answer[k, 0] = (Y[k, :].tolist().index(max(Y[k, :]))) + 1

    acc = accuracy(answer, testStr.y)
    print('trainTime:', trainTime)
    gmean, Rn = G_mean(answer, testStr.y, testStr.numOfClass)
    if baseclasser:
        return answer
    else:
        return acc, gmean, Rn, trainTime
Example #9
    def eval_test(self, X, y):
        out = self.sess.run(self.model.out,
                            feed_dict={self.model.X: X})

        acc = evaluation.accuracy(out, y)
        print 'Test accuracy:', acc

        prec, rec, f1 = evaluation.prec_rec(out, y)
        print 'prec:', prec, 'rec:', rec, 'f1:', f1
Example #10
def evaluation_index(y_pred, y_tar):
    y_pred_cat = y_pred[0].reshape(-1)
    y_target_cat = y_tar[0].reshape(-1)
    for j in range(1, len(y_pred)):
        y_pred_cat = np.concatenate((y_pred_cat, y_pred[j].reshape(-1)))
        y_target_cat = np.concatenate((y_target_cat, y_tar[j].reshape(-1)))

    rmse = compute_RMSE(y_pred_cat, y_target_cat)
    r = correlation_coefficient(y_pred_cat, y_target_cat)
    accuracy_rate = accuracy(y_pred_cat, y_target_cat)

    return rmse, r, accuracy_rate
Example #11
def evaluate_diso(y_label, y_conv, output, sess):
    y_placeholder = tf.placeholder(tf.float32, shape=[None, 2])
    y_conv_placeholder = tf.placeholder(tf.float32, shape=[None, 2])
    result = sess.run([evaluation.loss_cross_entropy(y_placeholder, y_conv_placeholder),
                       evaluation.accuracy(y_placeholder, y_conv_placeholder)],
                      feed_dict={y_placeholder: y_label,
                                 y_conv_placeholder: y_conv})
    # print("\t Entropy=%g, Accuracy=%g" % (result[0], result[1]))
    results = [str(res) for res in result]
    if output is not None:
        output.write('\t'.join(results) + '\n')
    return result, result[1]
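Because the result is handed to sess.run alongside the loss op, evaluation.accuracy here must build a graph op over the two (None, 2) placeholders. A plausible TF1-style sketch (assumed, not the repo's actual code):

import tensorflow as tf

def accuracy(y_true, y_pred):
    # Argmax over the class axis, then the mean agreement, as a graph op.
    correct = tf.equal(tf.argmax(y_true, axis=1), tf.argmax(y_pred, axis=1))
    return tf.reduce_mean(tf.cast(correct, tf.float32))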
Example #12
def runExperiment(dataPath, resultPath):
    epsilons = [0.001, 0.005, 0.01, 0.05, 0.1, 0.5]
    fairMeasureCodes = ['RD', 'RR', 'RC']
    i = 1
    text = ''
    while i <= 3:
        print(i)
        text += 'dataset No.' + str(i) + '\n'
        text += '---------------------------' + '\n'
        text += '---------------------------' + '\n'
        rules, hard_rules, counts, atoms = ground(dataPath + str(i) + '/')
        for code in fairMeasureCodes:
            print(code)
            results = map_inference(rules, hard_rules)
            accuracyScore = accuracy(dataPath + str(i) + '/', results, atoms)
            score = evaluate(results, counts, code)

            text += '----------' + code + '---------------' + '\n'
            text += '----------PSL--------------' + '\n'
            line = ''
            # Plain PSL does not depend on epsilon; repeat its scores so the
            # table columns line up with the FairPSL rows below.
            for epsilon in epsilons:
                text += str(score) + '\t'
                line += str(accuracyScore) + '\t'

            text += '\n' + line + '\n' + '----------FairPSL----------' + '\n'
            line = ''
            for epsilon in epsilons:
                print(epsilon)
                results = fair_map_inference(rules, hard_rules, counts, epsilon, code)
                accuracyScore = accuracy(dataPath + str(i) + '/', results, atoms)
                line += str(accuracyScore) + '\t'
                score = evaluate(results, counts, code)
                text += str(score) + '\t'
            text += '\n'
            text += line + '\n'
        text += '---------------------------' + '\n'
        text += '---------------------------' + '\n'
        i += 1
    with open(resultPath, 'w') as f:
        print(text, file=f)
Example #13
def main(args, config):
    start_time = time.time()
    # 1) Read data from database
    print(20*'=')
    print('1. Downloading data...')
    data = download_data(user=args.user, password=args.password,tb_name='sketch.train_data_2')
    oh = download_data(user=args.user, password=args.password,tb_name='sketch.ode_school')
    # 2) Data preparation
    print(20*'=')
    print('2. Data processing...')
    data = data_preparation(data, oh) # TODO: args can be the filter of which vars to use
    train_data, test_data = train_val_test_split(data, config['min_train_cohort'], config['min_test_cohort'])

    # 3) Model
    print(20*'=')
    print('3. Training model...')
    model = model_dict[config['model']]
    clf = model(train_data, args=config['hyperparameters'])

    # 4) Compute metric in validation set
    print(20*'=')
    print('4. Evaluation model in validation set...')
    metric = metric_dict[config['metric']](clf, test_data)
    print('{}: {}'.format(config['metric'], metric))

    # We print test and train accuracy
    train_accuracy = accuracy(clf, train_data)
    print('Train accuracy: ', train_accuracy)
    test_accuracy = accuracy(clf, test_data)
    print('Test accuracy: ', test_accuracy)

    # 5) Upload result to postgres
    print(20*'=')
    print('5. Uploading result to database...')
    upload_result(config['model_name'], config['metric'], metric, args.user, args.password)

    print(20*'=')
    print('Finished in {} seconds'.format(time.time()-start_time))
Example #14
def bagging_ELM(name,
                numberofHiddenNeurons,
                Type='W1',
                C=64,
                ActivationFunction='sig'):
    train, test = loadData(name)
    shapeOfAnswer = []
    numOfBaseClasser = 10
    trainStr = ELMDataStruct(train)
    testStr = ELMDataStruct(test)
    beginTrainTime = time()
    for i in range(numOfBaseClasser):
        print('Begin %d th train' % (i + 1))
        baggingTrain = dataBagging(trainStr)
        baggingTrainStr = ELMDataStruct(baggingTrain)
        answer = WELM(numberofHiddenNeurons,
                      baggingTrainStr,
                      testStr,
                      Type,
                      ActivationFunction,
                      C,
                      baseclasser=True)
        if i == 0:
            answerMatrix = answer
            shapeOfAnswer = shape(answer)
        else:
            answerMatrix = column_stack((answerMatrix, answer))
    outputAnswer = zeros((shapeOfAnswer))
    endTrainTime = time()
    trainTime = endTrainTime - beginTrainTime

    #matrix2CSV_Once(answerMatrix,[])
    for j in range(shapeOfAnswer[0]):
        voteAnswer = 1
        maxVoteNum = 0
        for k in range(trainStr.numOfClass):
            voteNum = sum(answerMatrix[j, :] == (k + 1))
            if voteNum > maxVoteNum:
                maxVoteNum = voteNum
                voteAnswer = k + 1
        outputAnswer[j] = voteAnswer
    #print(outputAnswer)
    #input()
    acc = accuracy(outputAnswer, testStr.y)  # score the vote, not the last base classifier
    print('-' * 20, 'Bagging result', '-' * 20)
    print('Bagging trainTime:', trainTime)
    gmean, Rn = G_mean(outputAnswer, testStr.y, testStr.numOfClass)
    print('-' * 20, 'Bagging result', '-' * 20)
    return acc, gmean, Rn, trainTime
Example #15
def batch_processor(model, data, train_mode):
    assert train_mode
    pred, loss = model(data, return_loss=True)

    log_vars = OrderedDict()
    log_vars['loss'] = loss.item()
    _, _, gt_labels = data
    # TODO: remove pad_label when computing batch accuracy
    pred_labels = torch.argmax(pred.cpu(), dim=1).long()
    gt_labels = gt_labels.cpu().numpy()
    pred_labels = pred_labels.numpy()
    log_vars['acc'] = accuracy(gt_labels, pred_labels)

    outputs = dict(loss=loss, log_vars=log_vars, num_samples=len(data[-1]))

    return outputs
Example #16
def task_3_logistic(x, y, x_test, y_test, args):
    accuracies = []
    sizes = np.linspace(10, 200, num=20)
    N = y.shape[0]
    for size in sizes:
        acc = 0
        for i in range(50):

            rand = np.random.randint(int(N), size=int(size))
            m = LogisticRegression(x[rand], y[rand])
            m.fit(lr=args[0], eps=args[1], regularization=args[2])
            pred = m.predict(x_test)
            cm = evaluation.confusion_matrix(y_test, pred)
            acc += evaluation.accuracy(cm)

        accuracies.append(acc/50)

    return accuracies, sizes
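Here evaluation.accuracy consumes a confusion matrix built from labels and predictions rather than the labels themselves. For a square class-by-class matrix that reduces to the trace over the grand total; a sketch assuming that layout:

import numpy as np

def accuracy(cm):
    # Correct predictions lie on the diagonal of the confusion matrix.
    return np.trace(cm) / np.sum(cm)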
Example #17
def evaluate_link(class_match_set, class_nonmatch_set, true_match_set,
                  all_comparisons):
    # Linkage evaluation
    linkage_result = evaluation.confusion_matrix(class_match_set,
                                                 class_nonmatch_set,
                                                 true_match_set,
                                                 all_comparisons)

    accuracy = evaluation.accuracy(linkage_result)
    precision = evaluation.precision(linkage_result)
    recall = evaluation.recall(linkage_result)
    fmeasure = evaluation.fmeasure(linkage_result)

    print('Linkage evaluation:')
    print('  Accuracy:    %.6f' % (accuracy))
    print('  Precision:   %.6f' % (precision))
    print('  Recall:      %.6f' % (recall))
    print('  F-measure:   %.6f' % (fmeasure))
    print('')
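Unlike the square matrix in the previous example, record-linkage code conventionally reduces the confusion matrix to (TP, FP, FN, TN) counts over the compared record pairs. Assuming evaluation.confusion_matrix returns such a tuple here, accuracy becomes:

def accuracy(confusion_matrix):
    # (true matches + true non-matches) over all classified record pairs.
    num_tp, num_fp, num_fn, num_tn = confusion_matrix
    total = num_tp + num_fp + num_fn + num_tn
    return (num_tp + num_tn) / total if total > 0 else 0.0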
Example #18
def task_3_naive(df, test_df, label, cont=[], cat=[], bin=[]):
    accuracies = []
    sizes = np.linspace(10, 200, num=20)
    N = df.shape[0]
    for size in sizes:
        acc = 0
        for i in range(25):
            print(size, i)
            rand = np.random.randint(int(N), size=int(size))
            m = NaiveBayes(df.loc[rand], label, continuous=cont, categorical=cat, binary=bin)
            pred = test_df.apply(m.predict, axis=1)

            cm = evaluation.confusion_matrix(test_df[label].to_numpy(), pred.to_numpy())
            acc += evaluation.accuracy(cm)

        accuracies.append(acc / 25)

    return accuracies, sizes
Example #19
    def validation_step(self, batch, batch_idx):
        metric_dict = {
            "u": {
                "loss": 0,
                "acc": 0,
                "f1": 0
            },
            "test": {
                "loss": 0,
                "acc": 0,
                "f1": 0
            },
        }

        # Loop through unlabelled and test loaders to calculate metrics #
        for key, data in batch.items():
            x, y = data
            logits, y_pred, _ = self.D(x)

            ## Loss ##
            loss = F.cross_entropy(y_pred, y)
            self.log(f"{key}/loss", loss)
            metric_dict[f"{key}"]["loss"] = loss.item()

            ## Accuracy ##
            acc = accuracy(y_pred, y)
            self.log(f"{key}/accuracy", acc)
            metric_dict[f"{key}"]["acc"] = acc.item()

            ## F1 score ##
            f1 = self.f1(y_pred, y)
            self.log(f"{key}/f1", f1)
            metric_dict[f"{key}"]["f1"] = f1.item()

            # Log best value #
            ## Probability of dataset being real ##
            p_real = UPSoftmax(logits)
            self.log(f"{key}/p_real", p_real)

        return metric_dict
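In this LightningModule, accuracy(y_pred, y) returns a tensor (acc.item() is logged), matching a functional metric over per-class scores. A minimal torch equivalent under that assumption (the real module may be torchmetrics or a local helper):

import torch

def accuracy(y_pred, y):
    # Argmax over the class dimension, then mean agreement as a 0-dim tensor.
    return (y_pred.argmax(dim=1) == y).float().mean()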
Example #20
    def validation_step(self, batch, batch_idx):
        #        metric_dict = {
        #            "u": {"loss": 0, "acc": 0, "f1": 0},
        #            "test": {"loss": 0, "acc": 0, "f1": 0},
        #        }
        #
        # Loop through unlabelled and test loaders to calculate metrics #
        for key, data in batch.items():
            x, y = data
            logits, y_pred, _ = self.D(x)

            ## Loss ##
            loss = F.cross_entropy(logits, y)
            self.log(f"{key}/loss", loss)
            #            metric_dict[f"{key}"]["loss"] = loss.item()

            ## Accuracy ##
            acc = accuracy(y_pred, y)
            self.log(f"{key}/accuracy", acc)
            #            metric_dict[f"{key}"]["acc"] = acc.item()

            ## F1 score ##
            f1 = self.f1(y_pred, y)
            self.log(f"{key}/f1", f1)
Example #21
def test(test_loader, model, criterion, print_freq):
    batch_time = AverageMeter()
    losses = AverageMeter()
    top1 = AverageMeter()
    top5 = AverageMeter()

    # switch to evaluate mode
    model.eval()

    end = time.time()
    #preds = np.zeros((0,7,))
    pred_labels = np.zeros([0,])
    GT_labels = np.zeros([0,])
    for i, (input, target) in enumerate(test_loader):
        target = target.cuda(non_blocking=True)  # 'async' is a reserved word in Python 3.7+
        # 'volatile' was removed in PyTorch 0.4; wrap the forward pass in
        # torch.no_grad() instead when evaluating.
        input_var = torch.autograd.Variable(input)
        target_var = torch.autograd.Variable(target)

        # compute output
        output = model(input_var)
        loss = criterion(output, target_var)

        '''
        cal_probs = torch.nn.Softmax(dim=0)
        probs = cal_probs(output)
        preds = np.concatenate([preds, probs.data.cpu().numpy()], axis=0)
        '''

        # measure accuracy and record loss
        prec1, prec5 = accuracy(output.data, target, topk=(1, 5))
        losses.update(loss.item(), input.size(0))  # loss.data[0] indexing was removed in PyTorch 0.4
        top1.update(prec1[0], input.size(0))
        top5.update(prec5[0], input.size(0))

        _, pred = output.data.topk(1, 1, True, True)
        pred_labels = np.concatenate([pred_labels, pred.cpu().numpy().flatten()], axis=0)
        GT_labels = np.concatenate([GT_labels, target.cpu().numpy().flatten()], axis=0)

        # measure elapsed time
        batch_time.update(time.time() - end)
        end = time.time()

        if i % print_freq == 0:
            print('Test: [{0}/{1}]\t'
                  'Time {batch_time.val:.3f} ({batch_time.avg:.3f})\t'
                  'Loss {loss.val:.4f} ({loss.avg:.4f})\t'
                  'Prec@1 {top1.val:.3f} ({top1.avg:.3f})\t'
                  'Prec@5 {top5.val:.3f} ({top5.avg:.3f})'.format(
                   i, len(test_loader), batch_time=batch_time, loss=losses,
                   top1=top1, top5=top5))

    print(' * Prec@1 {top1.avg:.3f} Prec@5 {top5.avg:.3f}'
          .format(top1=top1, top5=top5))

    categories = ['Angry', 'Disgust', 'Fear', 'Happy', 'Neutral', 'Sad', 'Surprise']
    build_confusion_mtx(GT_labels, pred_labels, categories)

    '''
    mean_score, std_score = get_inception_score(preds)
    print(' * IS: mean {mean_score:.3f} std {std_score:.3f}'.format(mean_score=mean_score, std_score=std_score))
    '''

    return top1.avg
Example #22
def main():
    # reading in
    import argparse
    parser = argparse.ArgumentParser()
    parser.add_argument("--data_dir",
                        default='data/sampling',
                        help='determine the base dir of the dataset document')
    parser.add_argument("--sample_n",
                        default=1000,
                        type=int,
                        help='starting image index of preprocessing')
    parser.add_argument("--evidence_n",
                        default=20,
                        type=int,
                        help='how many top/bottom tiles to pick from')
    parser.add_argument("--repl_n",
                        default=3,
                        type=int,
                        help='how many resampled replications')
    parser.add_argument("--image_split",
                        action='store_true',
                        help='if use image_split')
    parser.add_argument("--batch_size",
                        default=50,
                        type=int,
                        help="batch size")
    parser.add_argument("--stage_two",
                        action='store_true',
                        help='if only use stage two patients')
    parser.add_argument("--changhai",
                        action='store_true',
                        help='if use additional data')
    args = parser.parse_args()

    feature_size = 32
    #gpu = "cuda:0"
    gpu = None
    # 5-folds cross validation
    dataloader = CVDataLoader(args, gpu, feature_size)

    n_epoch = 800
    lr = 0.0005
    if args.stage_two:
        weight_decay = 0.008
    else:
        weight_decay = 0.005
    manytimes_n = 8

    if not os.path.isdir('figure'):
        os.mkdir('figure')
    if not os.path.isdir(os.path.join(args.data_dir, 'model')):
        os.mkdir(os.path.join(args.data_dir, 'model'))

    acc_folds = []
    auc_folds = []
    c_index_folds = []
    f1_folds = []
    f1_folds_pos = []
    total_round = 0
    model_count = 0

    loss_function = nn.BCEWithLogitsLoss(pos_weight=torch.tensor(0.8))

    for _ in range(manytimes_n):  # averaging
        for i in range(5):
            train_history = []
            test_history = []
            minimum_loss = None
            auc_fold = None
            acc_fold = None
            early_stop_count = 0

            model = Predictor(evidence_size=args.evidence_n,
                              layers=(100, 50, 1),
                              feature_size=feature_size)
            # model.apply(weight_init)
            if gpu:
                model = model.to(gpu)
            optimizer = torch.optim.RMSprop(model.parameters(),
                                            lr=lr,
                                            weight_decay=weight_decay)
            # optimizer = torch.optim.Adam(model.parameters(), lr=lr, weight_decay=weight_decay)

            dataloader.set_fold(i)
            X_test, Y_test, df_test = dataloader.get_test()
            # X_train, Y_train, df_train = dataloader.get_train()
            print('starting fold %d' % i)

            for epoch in range(n_epoch):
                #result = model(X_train)
                #loss = nn.functional.binary_cross_entropy(result, Y_train) + nn.functional.mse_loss(result, Y_train)
                # loss = nn.functional.mse_loss(result, Y_train)
                #loss.backward()
                #optimizer.step()
                #optimizer.zero_grad()

                # batch input
                for X_train_batch, Y_train_batch, df_train_batch in dataloader:
                    # print(X_train_batch.shape)
                    result = model(X_train_batch)
                    loss = loss_function(result, Y_train_batch)
                    loss.backward()
                    optimizer.step()
                    optimizer.zero_grad()

                # The last batch stands in for the train split in the
                # periodic metrics computed below.
                X_train, Y_train, df_train = X_train_batch, Y_train_batch, df_train_batch

                if epoch % 20 == 0:
                    result_test = model(X_test)
                    loss_test = loss_function(result_test, Y_test)
                    #loss_test = nn.functional.mse_loss(result_test, Y_test)
                    acc_train, acc_test = accuracy(result, Y_train), accuracy(
                        result_test, Y_test)
                    auc_train, auc_test = auc(result, Y_train), auc(
                        result_test, Y_test)
                    if args.changhai:
                        c_index_train, c_index_test = 0, 0
                    else:
                        c_index_train, c_index_test = c_index(
                            result, df_train), c_index(result_test, df_test)
                    recall_train, recall_test = recall(result,
                                                       Y_train), recall(
                                                           result_test, Y_test)
                    precision_train, precision_test = precision(
                        result, Y_train), precision(result_test, Y_test)
                    f1_train_pos, f1_test_pos = f1(result, Y_train), f1(
                        result_test, Y_test)
                    f1_train, f1_test = f1(result, Y_train,
                                           negative=True), f1(result_test,
                                                              Y_test,
                                                              negative=True)
                    train_history.append(
                        (epoch, loss, acc_train, auc_train, c_index_train))
                    test_history.append(
                        (epoch, loss_test, acc_test, auc_test, c_index_test))
                    if epoch % 40 == 0:
                        print(
                            "%s epoch:%d loss:%.3f/%.3f acc:%.3f/%.3f auc:%.3f/%.3f c_index:%.3f/%.3f recall:%.3f/%.3f prec:%.3f/%.3f f1:%.3f/%.3f f1(neg):%.3f/%.3f"
                            % (time.strftime(
                                '%m.%d %H:%M:%S', time.localtime(
                                    time.time())), epoch, loss, loss_test,
                               acc_train, acc_test, auc_train, auc_test,
                               c_index_train, c_index_test, recall_train,
                               recall_test, precision_train, precision_test,
                               f1_train_pos, f1_test_pos, f1_train, f1_test))
                    # early stop
                    if minimum_loss is None or minimum_loss * 0.995 > loss_test:
                        # if minimum_loss is None or minimum_loss > loss_test:
                        if f1_train == 0:
                            continue
                        minimum_loss = loss_test
                        auc_fold = auc_test
                        acc_fold = acc_test
                        c_index_fold = c_index_test
                        f1_fold_pos = f1_test_pos
                        f1_fold = f1_test
                        early_stop_count = 0
                    elif auc_test > auc_fold and auc_test > 0.5 and acc_test >= acc_fold:
                        minimum_loss = loss_test
                        auc_fold = auc_test
                        acc_fold = acc_test
                        c_index_fold = c_index_test
                        f1_fold_pos = f1_test_pos
                        f1_fold = f1_test
                        early_stop_count = 0
                    else:
                        early_stop_count += 1
                    if early_stop_count > 2 and epoch > 100:
                        if args.stage_two:
                            if auc_fold > 0.55:
                                print('early stop at epoch %d' % epoch)
                                break
                        elif early_stop_count > 3:
                            print('early stop at epoch %d' % epoch)
                            break
                    if epoch > 500:
                        optimizer = torch.optim.RMSprop(
                            model.parameters(),
                            lr * 0.6,
                            weight_decay=weight_decay * 1.2)

            train_history = np.array(train_history)
            test_history = np.array(test_history)
            acc_folds.append(acc_fold)
            auc_folds.append(auc_fold)
            f1_folds.append(f1_fold)
            f1_folds_pos.append(f1_fold_pos)
            c_index_folds.append(c_index_fold)
            plt.plot(train_history[:, 0], train_history[:, 1], label='train')
            plt.plot(test_history[:, 0], test_history[:, 1], label='test')
            plt.legend()
            plt.savefig('figure/sample_%d_fold%d.png' % (args.sample_n, i))
            plt.cla()
            if acc_fold > 0.7 and auc_fold > 0.6 and model_count < 10:
                model.save(args.data_dir + "/model/model_%d" % model_count)
                model_count += 1
            print("acc:%.3f\tauc:%.3f\tc_index:%.3f\tf1:%.3f" %
                  (acc_fold, auc_fold, c_index_fold, f1_fold))
            total_round += 1
            if gpu:
                del dataloader.X_train, dataloader.Y_train, dataloader.X_test, dataloader.Y_test
                del X_test, Y_test, X_train, Y_train, model, optimizer
                torch.cuda.empty_cache()

    print('CV-acc:%.3f CV-auc:%.3f CV-c-index:%.3f f1:%.3f f1(neg):%.3f' %
          (sum(acc_folds) / 5 / manytimes_n, sum(auc_folds) / 5 / manytimes_n,
           sum(c_index_folds) / 5 / manytimes_n, sum(f1_folds_pos) / 5 /
           manytimes_n, sum(f1_folds) / 5 / manytimes_n))
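The loop above pairs accuracy(result, Y) with nn.BCEWithLogitsLoss, so result presumably holds one raw logit per sample. Thresholding logits at 0 (sigmoid(0) = 0.5) gives a plausible sketch of such a helper; this is an assumption, since the helper is imported from elsewhere:

import torch

def accuracy(result, Y):
    # logit > 0 corresponds to a predicted probability above 0.5.
    pred = (result > 0).float()
    return (pred == Y).float().mean().item()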
Example #23
pc = evaluation.pairs_completeness(cand_rec_id_pair_list, true_match_set)
pq = evaluation.pairs_quality(cand_rec_id_pair_list, true_match_set)

print('Blocking evaluation:')
print('  Reduction ratio:    %.3f' % (rr))
print('  Pairs completeness: %.3f' % (pc))
print('  Pairs quality:      %.3f' % (pq))
print('')

# Linkage evaluation
#
linkage_result = evaluation.confusion_matrix(class_match_set,
                                             class_nonmatch_set,
                                             true_match_set, all_comparisons)

accuracy = evaluation.accuracy(linkage_result)
precision = evaluation.precision(linkage_result)
recall = evaluation.recall(linkage_result)
fmeasure = evaluation.fmeasure(linkage_result)

print('Linkage evaluation:')
print('  Accuracy:    %.3f' % (accuracy))
print('  Precision:   %.3f' % (precision))
print('  Recall:      %.3f' % (recall))
print('  F-measure:   %.3f' % (fmeasure))
print('')

linkage_time = loading_time + blocking_time + comparison_time + \
               classification_time
print('Total runtime required for linkage: %.3f sec' % (linkage_time))
Example #24
# Get cross validation accuracy for 5-fold cv
print("Ionosphere validation accuracy (default parameters):")
evaluation.cross_validation(5, ionosphere_train_features, ionosphere_train_labels, model=LogisticRegression)

# Grid search for optimal hyperparameters
print("Ionosphere grid search hyperparameters:")
ionosphere_max_val_acc, ionosphere_arg_max = evaluation.grid_search(learning_rates=lrs, epsilons=eps, lambdas=lamdas, x=ionosphere_train_features, y=ionosphere_train_labels, model=LogisticRegression)

# Accuracy on test split - train with best hyperparameters
print("Ionosphere test accuracy:")
logistic_ionosphere = LogisticRegression(ionosphere_train_features, ionosphere_train_labels)
logistic_ionosphere.fit(lr=ionosphere_arg_max[0], eps=ionosphere_arg_max[1], regularization=ionosphere_arg_max[2])
ionosphere_prediction = logistic_ionosphere.predict(ionosphere_test_features)
cm_ionosphere = evaluation.confusion_matrix(ionosphere_test_labels, ionosphere_prediction)
print("Accuracy:", evaluation.accuracy(cm_ionosphere), "Precision:", evaluation.precision(cm_ionosphere), "Recall:", evaluation.true_positive(cm_ionosphere), "F1:", evaluation.f_score(cm_ionosphere))

# 5-fold CV for naive bayes
print("Ionosphere validation accuracy (naive bayes):")
evaluation.cross_validation_naive(5, ionosphere_dataset.train_data, NaiveBayes, ionosphere_dataset.label_column, ionosphere_dataset.feature_columns)

naive_ionosphere = NaiveBayes(ionosphere_dataset.train_data, ionosphere_dataset.label_column, continuous=ionosphere_dataset.feature_columns)

print("Ionosphere test accuracy (naive bayes):")

ionosphere_pred_naive = ionosphere_dataset.test_data.apply(naive_ionosphere.predict, axis=1)
cm_ionosphere_naive = evaluation.confusion_matrix(ionosphere_test_labels, ionosphere_pred_naive.to_numpy())
print("Accuracy:", evaluation.accuracy(cm_ionosphere_naive), "Precision:", evaluation.precision(cm_ionosphere_naive), "Recall:", evaluation.true_positive(cm_ionosphere_naive), "F1:", evaluation.f_score(cm_ionosphere_naive))


# Abalone -----
Example #25
 mean_b1 = 0
 mean_b2 = 0
 mean_b3 = 0
 for i, file in enumerate(files):
     K.clear_session()
     model = load_model(save_path + '/' + file,
                        custom_objects={
                            "Flip_Attention":
                            Flip_Attention,
                            "lossFunction":
                            customLoss(K.variable(np.ones((1, 1))), 0.2)
                        })
     evaluate = model.evaluate(x_test, y_test)
     score = model.predict(test_embed)
     mean_acc += evaluate[1]
     bais_acc1 = accuracy(score, label, 0.45, 0.55)
     mean_b1 += float(bais_acc1)
     bais_acc2 = accuracy(score, label, 0.40, 0.60)
     mean_b2 += float(bais_acc2)
     bais_acc3 = accuracy(score, label, 0.35, 0.65)
     mean_b3 += float(bais_acc3)
     print(file, evaluate[1], bais_acc1, bais_acc2, bais_acc3)
     #        if evaluate[1] > max_acc:
     #            max_acc = evaluate[1]
     #            best_name = file
     del model
     gc.collect()
     logging.info(file + ' ' + str(evaluate[1]) + ' ' + str(bais_acc1) +
                  str(bais_acc2) + str(bais_acc3))
 print('mean_acc:', mean_acc / times)
 print('mean_b1:', mean_b1 / times)
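accuracy(score, label, lo, hi) takes two extra thresholds, and the bais_acc* names suggest accuracy restricted to confident predictions, i.e. samples whose positive-class score falls outside the [lo, hi] band. That reading is purely an assumption about unshown code; a sketch of it:

import numpy as np

def accuracy(score, label, lo, hi):
    # Hypothetical: drop samples in the ambiguous band, score the rest.
    p = np.asarray(score)[:, 1]
    keep = (p < lo) | (p > hi)
    if not np.any(keep):
        return float('nan')
    pred = (p[keep] > 0.5).astype(int)
    true = np.argmax(np.asarray(label)[keep], axis=1)
    return np.mean(pred == true)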
Example #26
    def train(self, epochs=10):
        if self.__xtrain and self.__ytrain and self.__xtest and self.__ytest:
            pass
        else:
            self.__load_dataset()

        # Open a writer to write summaries.
        self.__writer = tf.summary.FileWriter(self.__TMP_DIR,
                                              self.__session.graph)

        for epoch in range(epochs):
            #learning_rate = self.__session.run(self.__lr)
            #print('Learning rate', learning_rate)

            average_loss = 0
            num_steps = len(self.__flow)

            for step in tqdm.tqdm(range(num_steps),
                                  desc='Epoch ' +
                                  str(epoch + 1 + self.__GLOBAL_EPOCH) + '/' +
                                  str(epochs + self.__GLOBAL_EPOCH)):

                batch, label = self.__flow.next()

                run_metadata = tf.RunMetadata()
                _, l = self.__session.run([self.__train_op, self.__loss],
                                          feed_dict={
                                              self.__images: batch,
                                              self.__labels: label
                                          },
                                          run_metadata=run_metadata)

                average_loss += l

                # print loss and accuracy on the test set at the end of each epoch
                if step == num_steps - 1:

                    y_true = []
                    y_pred = []

                    for i in range(len(self.__xtest)):
                        prediction = self.__session.run(
                            self.__labels_predicted,
                            feed_dict={self.__images: [self.__xtest[i]]},
                            run_metadata=run_metadata)

                        y_true.append(self.__ytest[i])
                        y_pred.append(prediction[0])

                    accuracy = ev.accuracy(y_true, y_pred)

                    print('Loss:', str(average_loss / step), '\tAccuracy:',
                          accuracy)

                    with open(self.__TMP_DIR + '/log.txt',
                              'a',
                              encoding='utf8') as f:
                        f.write(
                            str(accuracy) + ' ' + str(average_loss / step) +
                            '\n')

                if step == (num_steps - 1) and epoch + 1 == epochs:
                    s = self.__session.run(self.__global_step)
                    self.__writer.add_run_metadata(run_metadata,
                                                   'step%d' % step,
                                                   global_step=s)

        self.__saver.save(self.__session,
                          os.path.join(self.__TMP_DIR, 'model.ckpt'))
        dp.global_epoch(self.__TMP_DIR + 'epoch.txt',
                        update=self.__GLOBAL_EPOCH + epochs)

        self.__writer.close()

        pg.generate_accuracy_plot(data_dir=self.__TMP_DIR)
        pg.generate_loss_plot(data_dir=self.__TMP_DIR)

        conf_mat = ev.confusion_matrix(y_true, y_pred, len(self.__SENTIMENTS))
        pg.generate_confusion_matrix_plot(conf_mat,
                                          self.__SENTIMENTS,
                                          data_dir=self.__TMP_DIR)
        pg.generate_confusion_matrix_plot(conf_mat,
                                          self.__SENTIMENTS,
                                          normalize=True,
                                          data_dir=self.__TMP_DIR)
Example #27
def run_scheme(scheme, descriptor_type, descriptor_param, num_clusters,
               clf_params, plotGraphs, PCAon, num_cols):
    print "Running scheme with the following parameters: "
    print "Scheme num: " + str(scheme) + ", BoVW: num_clusters=" + str(num_clusters) +\
        "; SVM: params:" + str(clf_params) + ";\n plotGraphs=" + str(plotGraphs) +\
          "; PCA_on=" + str(PCAon)
    start = time.time()

    # 1) Read the train and test files
    train_images_filenames = cPickle.load(
        open('train_images_filenames.dat', 'r'))
    test_images_filenames = cPickle.load(open('test_images_filenames.dat',
                                              'r'))
    train_labels = cPickle.load(open('train_labels.dat', 'r'))
    test_labels = cPickle.load(open('test_labels.dat', 'r'))
    print 'Loaded ' + str(
        len(train_images_filenames)) + ' training images filenames\
     with classes ', set(train_labels)
    print 'Loaded ' + str(
        len(test_images_filenames)) + ' testing images filenames\
     with classes ', set(test_labels)

    # 2) Extract features (train)
    D, Train_descriptors, kpt_dense, pca_train, sclr_train = computeTraining_descriptors(
        descriptor_type, descriptor_param, train_images_filenames,
        train_labels, PCAon, num_cols)

    # 3) Reduce number of features by PCA (reducing m=128 cols)
    #   Computed internally in computeTraining_descriptors()
    # 4) Compute codebook
    codebook = computeCodebook(num_clusters, D, descriptor_type,
                               descriptor_param, PCAon)
    # 5) Get training BoVW
    train_VW = getBoVW_train(codebook, num_clusters, Train_descriptors)

    # 6) Train SVM
    clf, train_scaler, D_scaled = clf_train(train_VW, train_labels, clf_params)

    # 7) Get test BoVW
    test_VW = getBoVW_test(codebook, num_clusters, test_images_filenames,
                           descriptor_type, descriptor_param, kpt_dense, PCAon,
                           pca_train, sclr_train)

    # 8) Get evaluation (accuracy, f-score, graphs, etc.)
    predictions = clf_predict(clf, clf_params, train_scaler, test_VW, D_scaled)
    # Get metrics and graphs:
    # We need to implement our own for latter integration with the rest of the project
    # Accuracy, F-score (multi-class=> average? add up?)

    acc = accuracy(test_labels, predictions)
    prec = precision(test_labels, predictions)
    rec = recall(test_labels, predictions)
    f1sc = f1score(test_labels, predictions)
    cm = confusionMatrix(test_labels, predictions)
    hits, misses = HitsAndMisses(cm)
    print "Confusion matrix:\n"
    print(str(cm))
    print("\n")
    print "Results (metrics):\n" + "Accuracy= {:04.2f}%\n" \
                                   "Precision= {:04.2f}%\n" \
                                   "Recall= {:04.2f}%\n" \
                                   "F1-score= {:04.2f}%\n" \
                                   "Hits(TP)={:d}\n" \
                                   "Misses(FN)={:d}\n".format(
        100*acc, 100*prec, 100*rec, 100*f1sc, hits, misses)
    print("\n")
    if plotGraphs:
        # Plot confusion matrix (and any other graph)
        print "Plotting confusion matrix..."
        plotConfusionMatrix(cm, test_labels)

    end = time.time()
    print 'Everything done in ' + str(end - start) + ' secs.'
Example #28
    zh = energy_multi_random(X, z, 30)
    a = accuracy(z, zh)
    print "Random / Energy:", a
    """

    for d in [5, 10, 15, 20, 25, 30, 50, 100, 200, 300, 500, 1000, 2000, 5000]:
        n = 1000
        m1 = np.zeros(d)
        m1[range(0, d, 2)] = 1
        s1 = np.eye(d)
        m2 = np.zeros(d)
        m2[range(0, d, 2)] = -1
        s2 = np.eye(d)
        X, z = two_gaussians(m1, s1, m2, s2, n)

        zh = kmeans(X)
        a_kmeans = accuracy(z, zh)

        Y = pca_projection(X)
        zh = kmeans(Y)
        a_pca = accuracy(z, zh)

        zh = kmeans_multi_random(X, z, 100)
        a_krandom = accuracy(z, zh)

        zh = energy_multi_random(X, z, 100)
        a_erandom = accuracy(z, zh)

        print "%i & %f & %f & %f & %f \\\\" % (d, a_kmeans, a_pca, a_krandom,
                                               a_erandom)
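accuracy(z, zh) compares ground-truth cluster assignments with k-means/energy labels, which are only defined up to a permutation of cluster ids. For the two-cluster setting used here, a common trick is to score both labelings and keep the better one (an assumption about the helper, with 0/1 labels assumed):

import numpy as np

def accuracy(z, zh):
    # Clustering labels are permutation-invariant: try both assignments.
    z, zh = np.asarray(z), np.asarray(zh)
    return max(np.mean(z == zh), np.mean(z == 1 - zh))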
Example #29
num_of_documents = args.qDocs

for p_doc in list(enumerate(corpora.positives[:num_of_documents])):
	print "extracting ngrams from positive documents"
	print p_doc[0]
	pp.extract_ngrams(p_doc[1], stopwords=args.stopwords)
	clear()

for n_doc in list(enumerate(corpora.negatives[:num_of_documents])):
	print "extracting ngrams from negative documents"
	print n_doc[0]
	pp.extract_ngrams(n_doc[1], stopwords=args.stopwords)
	clear()

print "____________________CLASSIFICATION STAGE____________________"
all_documents = corpora.positives[:num_of_documents] + corpora.negatives[:num_of_documents]
classifier = classification.OhanaBrendan(all_documents)
classifier.rule = args.tags
classifier.term_counting()

print "____________________EVALUATION STAGE____________________"
print args
print
print "Precision"
print str(eval.precision(len(corpora.positives[:num_of_documents]), corpora.negatives[:num_of_documents]) * decimal.Decimal(100)) + ' %'
print "Recall"
print str(eval.recall(len(corpora.positives[:num_of_documents]), corpora.positives[:num_of_documents]) * decimal.Decimal(100)) + ' %'
print "Accuracy"
print str(eval.accuracy(len(corpora.positives), len(corpora.negatives), all_documents) * decimal.Decimal(100)) + ' %'

Example #30
def test_gcn_e(model, cfg, logger):
    for k, v in cfg.model['kwargs'].items():
        setattr(cfg.test_data, k, v)
    dataset = build_dataset(cfg.model['type'], cfg.test_data)

    pred_peaks = dataset.peaks
    pred_dist2peak = dataset.dist2peak

    ofn_pred = osp.join(cfg.work_dir, 'pred_conns.npz')
    if osp.isfile(ofn_pred) and not cfg.force:
        data = np.load(ofn_pred)
        pred_conns = data['pred_conns']
        inst_num = data['inst_num']
        if inst_num != dataset.inst_num:
            logger.warn(
                'instance number in {} is different from dataset: {} vs {}'.
                format(ofn_pred, inst_num, len(dataset)))
    else:
        if cfg.random_conns:
            pred_conns = []
            for nbr, dist, idx in zip(dataset.subset_nbrs,
                                      dataset.subset_dists,
                                      dataset.subset_idxs):
                for _ in range(cfg.max_conn):
                    pred_rel_nbr = np.random.choice(np.arange(len(nbr)))
                    pred_abs_nbr = nbr[pred_rel_nbr]
                    pred_peaks[idx].append(pred_abs_nbr)
                    pred_dist2peak[idx].append(dist[pred_rel_nbr])
                    pred_conns.append(pred_rel_nbr)
            pred_conns = np.array(pred_conns)
        else:
            pred_conns = test(model, dataset, cfg, logger)
            for pred_rel_nbr, nbr, dist, idx in zip(pred_conns,
                                                    dataset.subset_nbrs,
                                                    dataset.subset_dists,
                                                    dataset.subset_idxs):
                pred_abs_nbr = nbr[pred_rel_nbr]
                pred_peaks[idx].extend(pred_abs_nbr)
                pred_dist2peak[idx].extend(dist[pred_rel_nbr])
        inst_num = dataset.inst_num

    if len(pred_conns) > 0:
        logger.info(
            'pred_conns (nbr order): mean({:.1f}), max({}), min({})'.format(
                pred_conns.mean(), pred_conns.max(), pred_conns.min()))

    if not dataset.ignore_label and cfg.eval_interim:
        subset_gt_labels = dataset.subset_gt_labels
        for i in range(cfg.max_conn):
            pred_peaks_labels = np.array([
                dataset.idx2lb[pred_peaks[idx][i]]
                for idx in dataset.subset_idxs
            ])

            acc = accuracy(pred_peaks_labels, subset_gt_labels)
            logger.info(
                '[{}-th] accuracy of pred_peaks labels ({}): {:.4f}'.format(
                    i, len(pred_peaks_labels), acc))

            # the rule for nearest nbr is only appropriate when nbrs is sorted
            nearest_idxs = np.where(pred_conns[:, i] == 0)[0]
            acc = accuracy(pred_peaks_labels[nearest_idxs],
                           subset_gt_labels[nearest_idxs])
            logger.info(
                '[{}-th] accuracy of pred labels (nearest: {}): {:.4f}'.format(
                    i, len(nearest_idxs), acc))

            not_nearest_idxs = np.where(pred_conns[:, i] > 0)[0]
            acc = accuracy(pred_peaks_labels[not_nearest_idxs],
                           subset_gt_labels[not_nearest_idxs])
            logger.info(
                '[{}-th] accuracy of pred labels (not nearest: {}): {:.4f}'.
                format(i, len(not_nearest_idxs), acc))

    with Timer('Peaks to clusters (th_cut={})'.format(cfg.tau)):
        pred_labels = peaks_to_labels(pred_peaks, pred_dist2peak, cfg.tau,
                                      inst_num)

    if cfg.save_output:
        logger.info(
            'save predicted connectivity and labels to {}'.format(ofn_pred))
        if not osp.isfile(ofn_pred) or cfg.force:
            np.savez_compressed(ofn_pred,
                                pred_conns=pred_conns,
                                inst_num=inst_num)

        # save clustering results
        idx2lb = list2dict(pred_labels, ignore_value=-1)

        folder = '{}_gcne_k_{}_th_{}_ig_{}'.format(cfg.test_name, cfg.knn,
                                                   cfg.th_sim,
                                                   cfg.test_data.ignore_ratio)
        opath_pred_labels = osp.join(cfg.work_dir, folder,
                                     'tau_{}_pred_labels.txt'.format(cfg.tau))
        mkdir_if_no_exists(opath_pred_labels)
        write_meta(opath_pred_labels, idx2lb, inst_num=inst_num)

    # evaluation
    if not dataset.ignore_label:
        print('==> evaluation')
        for metric in cfg.metrics:
            evaluate(dataset.gt_labels, pred_labels, metric)

        # H and C-scores
        gt_dict = {}
        pred_dict = {}
        for i in range(len(dataset.gt_labels)):
            gt_dict[str(i)] = dataset.gt_labels[i]
            pred_dict[str(i)] = pred_labels[i]
        bm = ClusteringBenchmark(gt_dict)
        scores = bm.evaluate_vmeasure(pred_dict)
        # fmi_scores = bm.evaluate_fowlkes_mallows_score(pred_dict)
        print(scores)
Example #31
print
print "Some documents couldn't be predicted; they were assigned None and will not be evaluated"
negative_docs_non_predicted = 0
list_of_true_negative_documents = []
for tn in corpora.negatives[:num_of_documents]:
	if tn.predicted_polarity:
		list_of_true_negative_documents.append(tn)
	else:
		negative_docs_non_predicted += 1

positive_docs_non_predicted = 0
list_of_true_positive_documents = []
for tp in corpora.positives[:num_of_documents]:
	if tp.predicted_polarity:
		list_of_true_positive_documents.append(tp)
	else:
		positive_docs_non_predicted += 1

print "Positive docs non predicted: " + str(positive_docs_non_predicted)
print "Negative docs non predicted: " + str(negative_docs_non_predicted)
print

print "Precision"
print str(eval.precision(len(corpora.positives[:num_of_documents]), list_of_true_negative_documents, ref=0.5) * decimal.Decimal(100)) + ' %'
print "Recall"
print str(eval.recall(len(corpora.positives[:num_of_documents]), list_of_true_positive_documents, ref=0.5) * decimal.Decimal(100)) + ' %'
print "Accuracy"
print str(eval.accuracy(len(corpora.positives), len(corpora.negatives),
				list_of_true_positive_documents + list_of_true_negative_documents, ref=0.5) * decimal.Decimal(100)) + ' %'

Example #32
def test_gcn_v(model, cfg, logger):
    for k, v in cfg.model['kwargs'].items():
        setattr(cfg.test_data, k, v)
    dataset = build_dataset(cfg.model['type'], cfg.test_data)

    folder = '{}_gcnv_k_{}_th_{}'.format(cfg.test_name, cfg.knn, cfg.th_sim)
    oprefix = osp.join(cfg.work_dir, folder)
    oname = osp.basename(rm_suffix(cfg.load_from))
    opath_pred_confs = osp.join(oprefix, 'pred_confs', '{}.npz'.format(oname))

    if osp.isfile(opath_pred_confs) and not cfg.force:
        data = np.load(opath_pred_confs)
        pred_confs = data['pred_confs']
        inst_num = data['inst_num']
        if inst_num != dataset.inst_num:
            logger.warn(
                'instance number in {} is different from dataset: {} vs {}'.
                format(opath_pred_confs, inst_num, len(dataset)))
    else:
        pred_confs, gcn_feat = test(model, dataset, cfg, logger)
        inst_num = dataset.inst_num

    logger.info('pred_confs: mean({:.4f}). max({:.4f}), min({:.4f})'.format(
        pred_confs.mean(), pred_confs.max(), pred_confs.min()))

    logger.info('Convert to cluster')
    with Timer('Prediction to peaks'):
        pred_dist2peak, pred_peaks = confidence_to_peaks(
            dataset.dists, dataset.nbrs, pred_confs, cfg.max_conn)

    if not dataset.ignore_label and cfg.eval_interim:
        # evaluate the intermediate results
        for i in range(cfg.max_conn):
            num = len(dataset.peaks)
            pred_peaks_i = np.arange(num)
            peaks_i = np.arange(num)
            for j in range(num):
                if len(pred_peaks[j]) > i:
                    pred_peaks_i[j] = pred_peaks[j][i]
                if len(dataset.peaks[j]) > i:
                    peaks_i[j] = dataset.peaks[j][i]
            acc = accuracy(pred_peaks_i, peaks_i)
            logger.info('[{}-th conn] accuracy of peak match: {:.4f}'.format(
                i + 1, acc))
            acc = 0.
            for idx, peak in enumerate(pred_peaks_i):
                acc += int(dataset.idx2lb[peak] == dataset.idx2lb[idx])
            acc /= len(pred_peaks_i)
            logger.info(
                '[{}-th conn] accuracy of peak label match: {:.4f}'.format(
                    i + 1, acc))

    with Timer('Peaks to clusters (th_cut={})'.format(cfg.tau_0)):
        pred_labels = peaks_to_labels(pred_peaks, pred_dist2peak, cfg.tau_0,
                                      inst_num)

    if cfg.save_output:
        logger.info('save predicted confs to {}'.format(opath_pred_confs))
        mkdir_if_no_exists(opath_pred_confs)
        np.savez_compressed(opath_pred_confs,
                            pred_confs=pred_confs,
                            inst_num=inst_num)

        # save clustering results
        idx2lb = list2dict(pred_labels, ignore_value=-1)

        opath_pred_labels = osp.join(
            cfg.work_dir, folder, 'tau_{}_pred_labels.txt'.format(cfg.tau_0))
        logger.info('save predicted labels to {}'.format(opath_pred_labels))
        mkdir_if_no_exists(opath_pred_labels)
        write_meta(opath_pred_labels, idx2lb, inst_num=inst_num)

    # evaluation
    if not dataset.ignore_label:
        print('==> evaluation')
        for metric in cfg.metrics:
            evaluate(dataset.gt_labels, pred_labels, metric)

    if cfg.use_gcn_feat:
        # gcn_feat is saved to disk for GCN-E
        opath_feat = osp.join(oprefix, 'features', '{}.bin'.format(oname))
        if not osp.isfile(opath_feat) or cfg.force:
            mkdir_if_no_exists(opath_feat)
            write_feat(opath_feat, gcn_feat)

        name = rm_suffix(osp.basename(opath_feat))
        prefix = oprefix
        ds = BasicDataset(name=name,
                          prefix=prefix,
                          dim=cfg.model['kwargs']['nhid'],
                          normalize=True)
        ds.info()

        # use top embedding of GCN to rebuild the kNN graph
        with Timer('connect to higher confidence with use_gcn_feat'):
            knn_prefix = osp.join(prefix, 'knns', name)
            knns = build_knns(knn_prefix,
                              ds.features,
                              cfg.knn_method,
                              cfg.knn,
                              is_rebuild=True)
            dists, nbrs = knns2ordered_nbrs(knns)

            pred_dist2peak, pred_peaks = confidence_to_peaks(
                dists, nbrs, pred_confs, cfg.max_conn)
            pred_labels = peaks_to_labels(pred_peaks, pred_dist2peak, cfg.tau,
                                          inst_num)

        # save clustering results
        if cfg.save_output:
            oname_meta = '{}_gcn_feat'.format(name)
            opath_pred_labels = osp.join(
                oprefix, oname_meta, 'tau_{}_pred_labels.txt'.format(cfg.tau))
            mkdir_if_no_exists(opath_pred_labels)

            idx2lb = list2dict(pred_labels, ignore_value=-1)
            write_meta(opath_pred_labels, idx2lb, inst_num=inst_num)

        # evaluation

        if not dataset.ignore_label:
            print('==> evaluation')
            for metric in cfg.metrics:
                evaluate(dataset.gt_labels, pred_labels, metric)
        import json
        import os
        import pdb
        pdb.set_trace()
        img_labels = json.load(
            open(r'/home/finn/research/data/clustering_data/test_index.json',
                 'r',
                 encoding='utf-8'))
        import shutil
        output = r'/home/finn/research/data/clustering_data/mr_gcn_output'
        for label in set(pred_labels):
            if not os.path.exists(os.path.join(output, f'cluster_{label}')):
                os.mkdir(os.path.join(output, f'cluster_{label}'))
        for image in img_labels:
            shutil.copy2(
                image,
                os.path.join(
                    os.path.join(output,
                                 f'cluster_{pred_labels[img_labels[image]]}'),
                    os.path.split(image)[-1]))