コード例 #1
0
def evaluate(model, page_tuples, page_labels, page_num_labels):
    """Score ``model`` on the given pages and print a one-line summary.

    The error counts returned by ``score_errors`` are passed to
    ``score_matthews``; the result is combined with the page counts from
    ``get_page_count`` by ``weighted_average``.  Both the weighted average
    and the ``score_matthews`` result are printed.  Nothing is returned.

    All four arguments are forwarded unchanged to ``score_errors``;
    ``page_labels`` is additionally passed to ``get_page_count``.
    """
    true_pos, true_neg, false_pos, false_neg = score_errors(
        model, page_tuples, page_labels, page_num_labels,
        True, TVDBG_P, TVDBG_L)
    label_scores = score_matthews(
        true_pos, true_neg, false_pos, false_neg, TVDBG_P, TVDBG_L)
    totals, counts = get_page_count(page_labels)
    overall = weighted_average(totals, counts, label_scores)
    report = 'weighted average = {} matthews={}'.format(overall, label_scores)
    print(report)
コード例 #2
0
def normalize(model, page_tuples, page_labels, page_num_labels):
    """Rescale each label's learner weights by a per-label score threshold.

    For every label, the positive classifier scores over all pages are
    collected.  Each sorted score from the second-smallest upwards is tried
    as a decision level; the level with the fewest false negatives plus
    false positives wins, and the midpoint between it and the next lower
    score becomes that label's factor.  Every learner's ``c1`` and ``c0``
    entry for the label is divided by the factor.  Labels with at most one
    positive score get both entries set to 0.0 instead.

    Args:
        model: dict with 'labels' (indexed by label number, used only in
            the error message) and 'learners' (a list of dicts, each
            holding per-label 'c1' and 'c0' sequences).  Modified in place.
        page_tuples: per-page data forwarded to ``classifier_tuples``;
            ``len(page_tuples[0])`` is the number of learners rescaled.
        page_labels: ``page_labels[p][l] == 1`` marks label ``l`` as
            correct for page ``p``.
        page_num_labels: not used by this function.

    Returns:
        The same ``model`` object, after in-place rescaling.
    """
    num_pages = len(page_labels)
    num_labels = len(page_labels[0])
    num_tuples = len(page_tuples[0])
    # NOTE(review): the results are unused here; the call is kept only in
    # case get_page_count has side effects -- confirm and remove.
    page_total, page_count = get_page_count(page_labels)

    # Per label: positive scores on pages where the label is correct
    # (right), where it is not (wrong), and all positive scores (both).
    score_right = [[] for _ in range(num_labels)]
    score_wrong = [[] for _ in range(num_labels)]
    score_both = [[] for _ in range(num_labels)]
    for p in range(num_pages):
        scores = classifier_tuples(model, page_tuples, p, TVDBG_P, TVDBG_L)
        for l in range(num_labels):
            score = scores[l]
            if score > 0:
                score_both[l].append(score)
                if page_labels[p][l] == 1:
                    score_right[l].append(score)
                else:
                    score_wrong[l].append(score)

    for l in range(num_labels):
        if len(score_both[l]) <= 1:
            print('ERROR: set factor to 0 since only {} positive scores for label {} = {}'.format(
                len(score_both[l]), l, model['labels'][l]))
            factor = 0.0
        else:
            # Sort once per label; the candidate loop only reads the list.
            levels = sorted(score_both[l])
            right = score_right[l]
            wrong = score_wrong[l]
            # Starting from "no best yet" guarantees the first candidate is
            # accepted, so factor is always bound for this label.  Starting
            # from num_pages left factor unassigned (or stale from the
            # previous label) when every level misclassified every page.
            min_errs = None
            for i in range(1, len(levels)):
                level = levels[i]
                # A false negative at this level is a correct-label score
                # below it; a false positive is a wrong-label score at or
                # above it.
                false_neg = sum(1 for s in right if s < level)
                false_pos = sum(1 for s in wrong if s >= level)
                errs = false_neg + false_pos
                if min_errs is None or errs < min_errs:
                    min_errs = errs
                    factor = (levels[i - 1] + level) / 2.0

        for t in range(num_tuples):
            learner = model['learners'][t]
            if factor == 0.0:
                learner['c1'][l] = 0.0
                learner['c0'][l] = 0.0
            else:
                learner['c1'][l] = learner['c1'][l] / factor
                learner['c0'][l] = learner['c0'][l] / factor

    # Re-run the classifier on every page with the rescaled weights; the
    # scores are discarded.  NOTE(review): looks like a debugging leftover
    # (TVDBG_* arguments) -- confirm classifier_tuples has no needed side
    # effects before removing.
    for p in range(num_pages):
        scores = classifier_tuples(model, page_tuples, p, TVDBG_P, TVDBG_L)
    return model
コード例 #3
0
ファイル: normalize.py プロジェクト: vandermude/Portfolio
def normalize(model, page_tuples, page_labels, page_num_labels):
    """Rescale each label's learner weights by a per-label score threshold.

    For every label, the positive classifier scores over all pages are
    collected.  Each sorted score from the second-smallest upwards is tried
    as a decision level; the level with the fewest false negatives plus
    false positives wins, and the midpoint between it and the next lower
    score becomes that label's factor.  Every learner's ``c1`` and ``c0``
    entry for the label is divided by the factor.  Labels with at most one
    positive score get both entries set to 0.0 instead.

    Args:
        model: dict with 'labels' (indexed by label number, used only in
            the error message) and 'learners' (a list of dicts, each
            holding per-label 'c1' and 'c0' sequences).  Modified in place.
        page_tuples: per-page data forwarded to ``classifier_tuples``;
            ``len(page_tuples[0])`` is the number of learners rescaled.
        page_labels: ``page_labels[p][l] == 1`` marks label ``l`` as
            correct for page ``p``.
        page_num_labels: not used by this function.

    Returns:
        The same ``model`` object, after in-place rescaling.
    """
    num_pages = len(page_labels)
    num_labels = len(page_labels[0])
    num_tuples = len(page_tuples[0])
    # NOTE(review): the results are unused here; the call is kept only in
    # case get_page_count has side effects -- confirm and remove.
    page_total, page_count = get_page_count(page_labels)

    # Per label: positive scores on pages where the label is correct
    # (right), where it is not (wrong), and all positive scores (both).
    score_right = [[] for _ in range(num_labels)]
    score_wrong = [[] for _ in range(num_labels)]
    score_both = [[] for _ in range(num_labels)]
    for p in range(num_pages):
        scores = classifier_tuples(model, page_tuples, p, TVDBG_P, TVDBG_L)
        for l in range(num_labels):
            score = scores[l]
            if score > 0:
                score_both[l].append(score)
                if page_labels[p][l] == 1:
                    score_right[l].append(score)
                else:
                    score_wrong[l].append(score)

    for l in range(num_labels):
        if len(score_both[l]) <= 1:
            print('ERROR: set factor to 0 since only {} positive scores for label {} = {}'.format(len(score_both[l]), l, model['labels'][l]))
            factor = 0.0
        else:
            # Sort once per label; the candidate loop only reads the list.
            levels = sorted(score_both[l])
            right = score_right[l]
            wrong = score_wrong[l]
            # Starting from "no best yet" guarantees the first candidate is
            # accepted, so factor is always bound for this label.  Starting
            # from num_pages left factor unassigned (or stale from the
            # previous label) when every level misclassified every page.
            min_errs = None
            for i in range(1, len(levels)):
                level = levels[i]
                # A false negative at this level is a correct-label score
                # below it; a false positive is a wrong-label score at or
                # above it.
                false_neg = sum(1 for s in right if s < level)
                false_pos = sum(1 for s in wrong if s >= level)
                errs = false_neg + false_pos
                if min_errs is None or errs < min_errs:
                    min_errs = errs
                    factor = (levels[i - 1] + level) / 2.0

        for t in range(num_tuples):
            learner = model['learners'][t]
            if factor == 0.0:
                learner['c1'][l] = 0.0
                learner['c0'][l] = 0.0
            else:
                learner['c1'][l] = learner['c1'][l] / factor
                learner['c0'][l] = learner['c0'][l] / factor

    # Re-run the classifier on every page with the rescaled weights; the
    # scores are discarded.  NOTE(review): looks like a debugging leftover
    # (TVDBG_* arguments) -- confirm classifier_tuples has no needed side
    # effects before removing.
    for p in range(num_pages):
        scores = classifier_tuples(model, page_tuples, p, TVDBG_P, TVDBG_L)
    return model
コード例 #4
0
ファイル: scorecnt.py プロジェクト: vandermude/Portfolio
def main():
    """
    Script entry point; reads its inputs from the command line.

    sys.argv[1]: data file with lines PAGE URL LABEL_1 LABEL_2 ... LABEL_n
                 and the associated text for each page
    sys.argv[2]: input model file with labels and an array of weak learners

    Prints the positive score TP / (TP + FP) for each label, followed by
    the weighted average of those scores.
    """
    data_path, model_path = sys.argv[1], sys.argv[2]
    # The model is loaded first because reading the data requires it.
    model = read_model(model_path)
    page_tuples, page_labels, page_num_labels = read_data_labels(data_path, model)
    true_pos, true_neg, false_pos, false_neg = score_errors(
        model, page_tuples, page_labels, page_num_labels,
        False, TVDBG_P, TVDBG_L)
    score = score_positive(true_pos, true_neg, false_pos, false_neg,
                           TVDBG_P, TVDBG_L)
    print('score = {}'.format(score))
    totals, counts = get_page_count(page_labels)
    overall = weighted_average(totals, counts, score)
    print('weighted average = {}'.format(overall))
コード例 #5
0
ファイル: score.py プロジェクト: vandermude/Portfolio
def main():
    """
    Script entry point; reads its inputs from the command line.

    sys.argv[1]: data file with lines PAGE URL LABEL_1 LABEL_2 ... LABEL_n
                 and the associated text for each page
    sys.argv[2]: input model file with labels and an array of weak learners

    Prints the matthews correlation coefficient for each label, followed
    by the weighted average of those coefficients.
    """
    data_path, model_path = sys.argv[1], sys.argv[2]
    # The model is loaded first because reading the data requires it.
    model = read_model(model_path)
    page_tuples, page_labels, page_num_labels = read_data_labels(data_path, model)
    true_pos, true_neg, false_pos, false_neg = score_errors(
        model, page_tuples, page_labels, page_num_labels, False)
    coefficients = score_matthews(true_pos, true_neg, false_pos, false_neg)
    print('score = {}'.format(coefficients))
    totals, counts = get_page_count(page_labels)
    overall = weighted_average(totals, counts, coefficients)
    print('weighted average = {}'.format(overall))
コード例 #6
0
ファイル: scorecnt.py プロジェクト: vandermude-zz/Portfolio
def main():
    """
    Script entry point; reads its inputs from the command line.

    sys.argv[1]: data file with lines PAGE URL LABEL_1 LABEL_2 ... LABEL_n
                 and the associated text for each page
    sys.argv[2]: input model file with labels and an array of weak learners

    Prints the positive score TP / (TP + FP) for each label, followed by
    the weighted average of those scores.
    """
    data_path, model_path = sys.argv[1], sys.argv[2]
    # The model is loaded first because reading the data requires it.
    model = read_model(model_path)
    page_tuples, page_labels, page_num_labels = read_data_labels(data_path, model)
    true_pos, true_neg, false_pos, false_neg = score_errors(
        model, page_tuples, page_labels, page_num_labels, False)
    score = score_positive(true_pos, true_neg, false_pos, false_neg)
    print('score = {}'.format(score))
    totals, counts = get_page_count(page_labels)
    overall = weighted_average(totals, counts, score)
    print('weighted average = {}'.format(overall))
コード例 #7
0
ファイル: scorepos.py プロジェクト: vandermude-zz/Portfolio
def main():
    """
    Script entry point; reads its inputs from the command line.

    sys.argv[1]: data file with lines PAGE URL LABEL_1 LABEL_2 ... LABEL_n
                 and the associated text for each page
    sys.argv[2]: input model file with labels and an array of weak learners

    Prints the matthews correlation coefficient for each label, followed
    by the weighted average of those coefficients.
    """
    data_path, model_path = sys.argv[1], sys.argv[2]
    # The model is loaded first because reading the data requires it.
    model = read_model(model_path)
    page_tuples, page_labels, page_num_labels = read_data_labels(data_path, model)
    true_pos, true_neg, false_pos, false_neg = score_errors(
        model, page_tuples, page_labels, page_num_labels,
        True, TVDBG_P, TVDBG_L)
    coefficients = score_matthews(true_pos, true_neg, false_pos, false_neg,
                                  TVDBG_P, TVDBG_L)
    print('score = {}'.format(coefficients))
    totals, counts = get_page_count(page_labels)
    overall = weighted_average(totals, counts, coefficients)
    print('weighted average = {}'.format(overall))
コード例 #8
0
ファイル: normalize.py プロジェクト: vandermude/Portfolio
def evaluate(model, page_tuples, page_labels, page_num_labels):
    """Score ``model`` on the given pages and print a one-line summary.

    The error counts returned by ``score_errors`` are passed to
    ``score_matthews``; the result is combined with the page counts from
    ``get_page_count`` by ``weighted_average``.  Both the weighted average
    and the ``score_matthews`` result are printed.  Nothing is returned.

    All four arguments are forwarded unchanged to ``score_errors``;
    ``page_labels`` is additionally passed to ``get_page_count``.
    """
    true_pos, true_neg, false_pos, false_neg = score_errors(
        model, page_tuples, page_labels, page_num_labels,
        True, TVDBG_P, TVDBG_L)
    label_scores = score_matthews(
        true_pos, true_neg, false_pos, false_neg, TVDBG_P, TVDBG_L)
    totals, counts = get_page_count(page_labels)
    overall = weighted_average(totals, counts, label_scores)
    report = 'weighted average = {} matthews={}'.format(overall, label_scores)
    print(report)