コード例 #1
0
def getTrainedModel(training_set, test_set, n_round):
    '''
    Train a maxent model on training_set, then report a confusion matrix
    and accuracy over test_set via the shared log.

    @param  training_set    This is a list of pairs, where each pair is
                            (user_id, label)
    @param  test_set        This is in the same format as training_set
    @param  n_round         Round number, used only in the log messages.
    @return model           Trained model which expose eval_all() method
    '''
    # {{{

    # TRAINING phase
    model = cmaxent.MaxentModel()
    model.begin_add_event()

    # Add one event per labeled user; users without an extractable
    # context are silently skipped (same policy as the test phase below).
    for user_id, label in training_set:
        context = getContext(user_id)
        if context is None:
            continue
        model.add_event(context, label, 1.0)  # uniform instance weight

    model.end_add_event(1)
    # 100 L-BFGS iterations; remaining args are presumably the Gaussian
    # prior variance (1e1) and convergence tolerance (1e-4) -- confirm
    # against the cmaxent API.
    model.train(100, 'lbfgs', 1e1, 1e-4)

    # TEST phase: rows = true label, columns = predicted label (4 classes).
    confusion_matrix = np.zeros([4, 4])

    for user_id, label in test_set:
        context = getContext(user_id)
        if context is None:
            continue

        # eval_all() returns (label, score) pairs; the first entry is
        # taken as the top prediction.
        predictions = model.eval_all(context)
        predicted_target = int(predictions[0][0])
        confusion_matrix[int(label)][predicted_target] += 1

    # Guard the 0/0 case where every test user lacked a context.
    total = confusion_matrix.sum()
    accuracy = np.trace(confusion_matrix) / float(total) if total > 0 else 0.0

    # Write the log
    logAndPrint('Round %d; Confusion Matrix' % n_round)
    logAndPrint(str(confusion_matrix))
    logAndPrint('Test Accuracy: %f\n' % accuracy)

    return model    # }}}
コード例 #2
0
ファイル: caliplot.py プロジェクト: pyongjoo/twitter-research
def caliplot():
    '''
    Print a per-class calibration table: for each of the 4 classes, how
    often each confidence bin was predicted versus how often it carried
    the top prediction.
    '''
    model = getLearner()

    test_file = '../../data/semi/train_test_hardlabel/test0'

    confidence_bins = np.zeros([4, BIN_SIZE], dtype=float)
    correct_guess_bins = np.zeros([4, BIN_SIZE], dtype=float)

    # 'with' guarantees the file handle is closed even if a line is
    # malformed and split() raises.
    with open(test_file) as test_lines:
        for line in test_lines:
            user_id, target = line.rstrip('\n').split('\t')
            context = getContext(user_id)
            if context is None:  # no features for this user
                continue

            predictions = model.eval_all(context)
            predicted_target = int(predictions[0][0])
            # Clamp so that confidence == 1.0 falls in the last bin
            # instead of indexing one past the end of the array.
            with_conf_bin = min(int(predictions[0][1] * BIN_SIZE),
                                BIN_SIZE - 1)

            for label, confidence in predictions:
                conf_bin = min(int(confidence * BIN_SIZE), BIN_SIZE - 1)
                confidence_bins[int(label)][conf_bin] += 1

            correct_guess_bins[predicted_target][with_conf_bin] += 1

    # Empty bins produce 0/0 -> nan; suppress the numpy warnings but keep
    # the nan/inf entries visible in the printed table.
    with np.errstate(divide='ignore', invalid='ignore'):
        print(correct_guess_bins / confidence_bins)
コード例 #3
0
def addInstancesFromFile(model, file_name):
    '''
    Add every usable instance from a tab-separated "user_id<TAB>target"
    file into the given maxent model.

    @param  model       model exposing add_event(context, target, weight)
    @param  file_name   path to the labeled-instance file
    @return model       the same model object, for call chaining
    '''
    counter = 0     # count the number of instances successfully added

    # 'with' guarantees the file handle is closed even on a parse error.
    with open(file_name) as instance_file:
        for line in instance_file:
            user_id, target = line.rstrip('\n').split('\t')
            context = getContext(user_id)
            if context is None:  # no features for this user; skip it
                continue
            model.add_event(context, target, 1.0)  # uniform weight
            counter += 1

    print("A total of %d instances have been added into the model." % counter)

    return model
コード例 #4
0
def evaluate(model, unlabeled_set, n_round):
    '''
    Run the model over every user in unlabeled_set and bucket users by
    their predicted class, keeping the prediction score of each.

    @param  model          trained model exposing eval_all()
    @param  unlabeled_set  collection of user_ids (must support len())
    @param  n_round        round number, used only in the log messages
    @return list of 4 lists; entry k holds (user_id, score) pairs for
            users whose predicted class is k
    '''
    # {{{

    evaluation_results = [[] for _ in range(4)]

    print('start to evaluate unlabeled set of size %d.' % len(unlabeled_set))

    count = 1

    for user_id in unlabeled_set:
        context = getContext(user_id)
        if context is None:  # skipped users do not advance the counter
            continue

        # The first entry of eval_all() is taken as the top prediction;
        # presumably the list is sorted by score -- confirm against the
        # model API.
        predictions = model.eval_all(context)
        predicted_target, score = predictions[0]
        evaluation_results[int(predicted_target)].append((user_id, score))

        # Progress indicators: one dot per 100, one line per 1000.
        if count % 100 == 0:
            sys.stdout.write('.')
            sys.stdout.flush()
        if count % 1000 == 0:
            print('%d out of %d data evaluated.' % (count, len(unlabeled_set)))

        count += 1

    # let's see the confidence distribution of the pseudo labeled data
    num_bins = 10
    confidence_bins = np.zeros([4, num_bins], dtype=int)

    for label, eval_list in enumerate(evaluation_results):
        for user_id, score in eval_list:
            # Clamp so score == 1.0 lands in the last bin rather than
            # indexing one past the end of the 10-wide axis.
            bin_idx = min(int(score * num_bins), num_bins - 1)
            confidence_bins[label][bin_idx] += 1

    # write log
    logAndPrint('Round %d; Confidence distribution' % n_round)
    logAndPrint(str(confidence_bins) + '\n')

    return evaluation_results   # }}}