Esempio n. 1
0
def main():
    """
    Classify every page of a data file and write the ranked label scores.

    Arguments
        argv[1] text to be classified
        argv[2] model
        argv[3] classification result

    Each entry returned by read_data_text is split at its first tab into a
    URL and the page text, and runs of whitespace in the text are collapsed
    to single spaces before scoring.  One line is written per page:
    PAGE<TAB>url<TAB>[[label, score], ...] with the pairs ordered from the
    highest score to the lowest.

    Exits with status 1 after printing a usage message when the number of
    command line arguments is wrong.
    """
    if len(sys.argv) != 4:
        print('Usage: classifier.py data model output')
        # A usage error must not look like a successful run to the caller.
        sys.exit(1)
    pages = read_data_text(sys.argv[1])
    model = read_model(sys.argv[2])
    print('model has {} labels and {} tuples'.format(len(model['labels']), len(model['learners'])))
    label_names = [label.encode("utf8") for label in model['labels']]
    print('read {} pages'.format(len(pages)))
    with open(sys.argv[3], 'w') as output_file:
        for page in pages:
            url, text = page.split('\t', 1)
            text = ' '.join(text.split())
            scores = classifier_text(model, text)
            # Pair every label with its score, best score first.
            ranked = sorted(
                ([name, scores[index]] for index, name in enumerate(label_names)),
                key=lambda pair: pair[1],
                reverse=True)
            output_file.write('PAGE\t{}\t{}\n'.format(url, ranked))
Esempio n. 2
0
def main():
    """
    Score each page of a data file against a model and save the ranking.

    Arguments
        argv[1] text to be classified
        argv[2] model
        argv[3] classification result

    Every entry returned by read_data_text is cut at its first tab into a
    URL and the page text; whitespace runs in the text are squeezed to one
    space before classifier_text is called.  The output file receives one
    line per page, PAGE<TAB>url<TAB>[[label, score], ...], with the
    label/score pairs sorted by descending score.

    Exits with status 1 after printing a usage message when the number of
    command line arguments is wrong.
    """
    if len(sys.argv) != 4:
        print('Usage: classifier.py data model output')
        # Signal the usage error to the caller instead of exiting with 0.
        sys.exit(1)
    pages = read_data_text(sys.argv[1])
    model = read_model(sys.argv[2])
    print('model has {} labels and {} tuples'.format(
        len(model['labels']),
        len(model['learners'])))
    label_names = [label.encode("utf8") for label in model['labels']]
    print('read {} pages'.format(len(pages)))
    with open(sys.argv[3], 'w') as output_file:
        for page in pages:
            url, text = page.split('\t', 1)
            text = ' '.join(text.split())
            scores = classifier_text(model, text)
            label_scores = [[name, scores[position]]
                            for position, name in enumerate(label_names)]
            # Stable in-place sort, highest score first.
            label_scores.sort(key=lambda entry: entry[1], reverse=True)
            output_file.write('PAGE\t{}\t{}\n'.format(url, label_scores))
Esempio n. 3
0
def main():
    """
    Load a model file and print how many labels and learners it holds.

    Command Line Inputs:
    argv[1] Input Model file with labels and array of weak learners

    Exits with status 1 after printing a usage message when the model file
    argument is missing.
    """
    if len(sys.argv) < 2:
        # Without this guard a missing argument surfaced as an IndexError.
        print('Usage: {} model'.format(sys.argv[0]))
        sys.exit(1)
    filename = sys.argv[1]
    print('open {}'.format(filename))
    model = read_model(filename)
    print('model has {} labels and {} tuples'.format(len(model['labels']), len(model['learners'])))
Esempio n. 4
0
def main():
    """
    Report the number of labels and learners stored in a model file.

    Command Line Inputs:
    argv[1] Input Model file with labels and array of weak learners

    Exits with status 1 after printing a usage message when the model file
    argument is missing.
    """
    if len(sys.argv) < 2:
        # Fail with a readable message rather than an IndexError traceback.
        print('Usage: {} model'.format(sys.argv[0]))
        sys.exit(1)
    model_path = sys.argv[1]
    print('open {}'.format(model_path))
    model = read_model(model_path)
    print('model has {} labels and {} tuples'.format(
        len(model['labels']),
        len(model['learners'])))
Esempio n. 5
0
def main():
    """
    Run naivebayes over a model using labelled page data and save the result.

    Command Line Inputs:
    argv[1] Input: Data file with lines:
            PAGE<TAB>URL<TAB>LABEL_1<TAB>LABEL_2<TAB>...<TAB>LABEL_n
            and associated text for page
    argv[2] Input: Model file with labels and array of weak learners
    argv[3] Output: New Model file with labels and array of weak learners
            with naive bayesian weights

    Exits with status 1 after printing a usage message when fewer than
    three arguments are given.
    """
    if len(sys.argv) < 4:
        # Without this guard a missing argument surfaced as an IndexError.
        print('Usage: {} data model output'.format(sys.argv[0]))
        sys.exit(1)
    # The page list returned first is not needed by this script.
    _pages, labels, label_text, tuples, tuples_selected, tuple_text = read_data_tuples(
        sys.argv[1])
    model = read_model(sys.argv[2])
    print('model has {} labels and {} tuples'.format(
        len(model['labels']),
        len(model['learners'])))
    # The return value of naivebayes is not used; presumably it updates
    # `model` in place -- confirm against its definition.
    naivebayes(model, labels, tuples, tuples_selected, label_text, tuple_text)
    write_model(sys.argv[3], model)
Esempio n. 6
0
def main():
    """
    Run cntnorm over a model using labelled page data and save the result.

    Command Line Inputs:
    argv[1] Input: Data file with lines:
            PAGE<TAB>URL<TAB>LABEL_1<TAB>LABEL_2<TAB>...<TAB>LABEL_n
            and associated text for page
    argv[2] Input: Model file with labels and array of weak learners
    argv[3] Output: New Model file with labels and array of weak learners
            with normalized weights

    Exits with status 1 after printing a usage message when fewer than
    three arguments are given.
    """
    if len(sys.argv) < 4:
        # Without this guard a missing argument surfaced as an IndexError.
        print('Usage: {} data model output'.format(sys.argv[0]))
        sys.exit(1)
    # The page list and the tuple selection are returned by
    # read_data_tuples but are not needed by cntnorm.
    _pages, labels, label_text, tuples, _tuples_selected, tuple_text = read_data_tuples(sys.argv[1])
    model = read_model(sys.argv[2])
    print('model has {} labels and {} tuples'.format(len(model['labels']), len(model['learners'])))
    # The return value of cntnorm is not used; presumably it updates
    # `model` in place -- confirm against its definition.
    cntnorm(model, labels, tuples, label_text, tuple_text)
    write_model(sys.argv[3], model)
Esempio n. 7
0
def main():
    """
    Normalize a model against labelled pages, evaluating it before and after.

    Command Line Inputs:
    argv[1] Data file with lines:
            PAGE<TAB>URL<TAB>LABEL_1<TAB>LABEL_2<TAB>...<TAB>LABEL_n
            and associated text for page
    argv[2] Input Model file with labels and array of weak learners
    argv[3] Output Model file with correction factors

    evaluate() is called once on the model as read and once on the model
    returned by normalize(); according to the original description of this
    script it prints the matthews correlation coefficient for each label.

    Exits with status 1 after printing a usage message when fewer than
    three arguments are given.
    """
    if len(sys.argv) < 4:
        # Without this guard a missing argument surfaced as an IndexError.
        print('Usage: {} data model output'.format(sys.argv[0]))
        sys.exit(1)
    model = read_model(sys.argv[2])
    print('model has {} labels and {} tuples'.format(len(model['labels']), len(model['learners'])))
    # The data file is read after the model because read_data_labels needs it.
    page_tuples, page_labels, page_num_labels = read_data_labels(sys.argv[1], model)
    print('read {} pages'.format(len(page_labels)))
    evaluate(model, page_tuples, page_labels, page_num_labels)
    model = normalize(model, page_tuples, page_labels, page_num_labels)
    write_model(sys.argv[3], model)
    evaluate(model, page_tuples, page_labels, page_num_labels)
Esempio n. 8
0
def main():
    """
    Print the positive score of a model and its weighted average.

    Command Line Inputs:
    argv[1] Data file with lines:
            PAGE<TAB>URL<TAB>LABEL_1<TAB>LABEL_2<TAB>...<TAB>LABEL_n
            and associated text for page
    argv[2] Input Model file with labels and array of weak learners

    Output the positive score TP / (TP + FP) for each label, followed by the
    value weighted_average() derives from it and the page counts.

    Exits with status 1 after printing a usage message when fewer than two
    arguments are given.
    """
    if len(sys.argv) < 3:
        # Without this guard a missing argument surfaced as an IndexError.
        print('Usage: {} data model'.format(sys.argv[0]))
        sys.exit(1)
    model = read_model(sys.argv[2])
    page_tuples, page_labels, page_num_labels = read_data_labels(sys.argv[1], model)
    # TVDBG_P / TVDBG_L are module-level settings forwarded unchanged to the
    # scoring helpers (presumably debug selectors -- confirm where defined).
    TP, TN, FP, FN = score_errors(model, page_tuples, page_labels, page_num_labels, False, TVDBG_P, TVDBG_L)
    score = score_positive(TP, TN, FP, FN, TVDBG_P, TVDBG_L)
    print('score = {}'.format(score))
    page_total, page_count = get_page_count(page_labels)
    wgt_avg = weighted_average(page_total, page_count, score)
    print('weighted average = {}'.format(wgt_avg))
Esempio n. 9
0
def main():
    """
    Print the matthews score of a model and its weighted average.

    Command Line Inputs:
    argv[1] Data file with lines:
            PAGE<TAB>URL<TAB>LABEL_1<TAB>LABEL_2<TAB>...<TAB>LABEL_n
            and associated text for page
    argv[2] Input Model file with labels and array of weak learners

    Output the matthews correlation coefficient for each label, followed by
    the value weighted_average() derives from it and the page counts.

    Exits with status 1 after printing a usage message when fewer than two
    arguments are given.
    """
    if len(sys.argv) < 3:
        # Without this guard a missing argument surfaced as an IndexError.
        print('Usage: {} data model'.format(sys.argv[0]))
        sys.exit(1)
    model = read_model(sys.argv[2])
    page_tuples, page_labels, page_num_labels = read_data_labels(sys.argv[1], model)
    TP, TN, FP, FN = score_errors(model, page_tuples, page_labels, page_num_labels, False)
    matthews = score_matthews(TP, TN, FP, FN)
    print('score = {}'.format(matthews))
    page_total, page_count = get_page_count(page_labels)
    wgt_avg = weighted_average(page_total, page_count, matthews)
    print('weighted average = {}'.format(wgt_avg))
Esempio n. 10
0
def main():
    """
    Print the positive score of a model and its weighted average.

    Command Line Inputs:
    argv[1] Data file with lines:
            PAGE<TAB>URL<TAB>LABEL_1<TAB>LABEL_2<TAB>...<TAB>LABEL_n
            and associated text for page
    argv[2] Input Model file with labels and array of weak learners

    Output the positive score TP / (TP + FP) for each label, followed by the
    value weighted_average() derives from it and the page counts.

    Exits with status 1 after printing a usage message when fewer than two
    arguments are given.
    """
    if len(sys.argv) < 3:
        # Without this guard a missing argument surfaced as an IndexError.
        print('Usage: {} data model'.format(sys.argv[0]))
        sys.exit(1)
    model = read_model(sys.argv[2])
    page_tuples, page_labels, page_num_labels = read_data_labels(sys.argv[1], model)
    TP, TN, FP, FN = score_errors(model, page_tuples, page_labels, page_num_labels, False)
    score = score_positive(TP, TN, FP, FN)
    print('score = {}'.format(score))
    page_total, page_count = get_page_count(page_labels)
    wgt_avg = weighted_average(page_total, page_count, score)
    print('weighted average = {}'.format(wgt_avg))
Esempio n. 11
0
def main():
    """
    Apply cntnorm to a model using labelled page data and write it back out.

    Command Line Inputs:
    argv[1] Input: Data file with lines:
            PAGE<TAB>URL<TAB>LABEL_1<TAB>LABEL_2<TAB>...<TAB>LABEL_n
            and associated text for page
    argv[2] Input: Model file with labels and array of weak learners
    argv[3] Output: New Model file with labels and array of weak learners
            with normalized weights

    Exits with status 1 after printing a usage message when fewer than
    three arguments are given.
    """
    if len(sys.argv) < 4:
        # Fail with a readable message rather than an IndexError traceback.
        print('Usage: {} data model output'.format(sys.argv[0]))
        sys.exit(1)
    # Only four of the six values returned by read_data_tuples are used here.
    _pages, labels, label_text, tuples, _tuples_selected, tuple_text = read_data_tuples(
        sys.argv[1])
    model = read_model(sys.argv[2])
    print('model has {} labels and {} tuples'.format(
        len(model['labels']),
        len(model['learners'])))
    # The return value of cntnorm is not used; presumably it updates
    # `model` in place -- confirm against its definition.
    cntnorm(model, labels, tuples, label_text, tuple_text)
    write_model(sys.argv[3], model)
Esempio n. 12
0
def main():
    """
    Evaluate a model, normalize it against labelled pages, and evaluate again.

    Command Line Inputs:
    argv[1] Data file with lines:
            PAGE<TAB>URL<TAB>LABEL_1<TAB>LABEL_2<TAB>...<TAB>LABEL_n
            and associated text for page
    argv[2] Input Model file with labels and array of weak learners
    argv[3] Output Model file with correction factors

    The model returned by normalize() is written to argv[3].  evaluate() runs
    on both the original and the normalized model; according to the original
    description of this script it prints the matthews correlation coefficient
    for each label.

    Exits with status 1 after printing a usage message when fewer than
    three arguments are given.
    """
    if len(sys.argv) < 4:
        # Fail with a readable message rather than an IndexError traceback.
        print('Usage: {} data model output'.format(sys.argv[0]))
        sys.exit(1)
    model = read_model(sys.argv[2])
    print('model has {} labels and {} tuples'.format(
        len(model['labels']),
        len(model['learners'])))
    page_tuples, page_labels, page_num_labels = read_data_labels(
        sys.argv[1], model)
    print('read {} pages'.format(len(page_labels)))
    evaluate(model, page_tuples, page_labels, page_num_labels)
    model = normalize(model, page_tuples, page_labels, page_num_labels)
    write_model(sys.argv[3], model)
    evaluate(model, page_tuples, page_labels, page_num_labels)
Esempio n. 13
0
def main():
    """
    Print the matthews score of a model and its weighted average.

    Command Line Inputs:
    argv[1] Data file with lines:
            PAGE<TAB>URL<TAB>LABEL_1<TAB>LABEL_2<TAB>...<TAB>LABEL_n
            and associated text for page
    argv[2] Input Model file with labels and array of weak learners

    Output the matthews correlation coefficient for each label, followed by
    the value weighted_average() derives from it and the page counts.

    Exits with status 1 after printing a usage message when fewer than two
    arguments are given.
    """
    if len(sys.argv) < 3:
        # Fail with a readable message rather than an IndexError traceback.
        print('Usage: {} data model'.format(sys.argv[0]))
        sys.exit(1)
    model = read_model(sys.argv[2])
    page_tuples, page_labels, page_num_labels = read_data_labels(
        sys.argv[1], model)
    # TVDBG_P / TVDBG_L are module-level settings forwarded unchanged to the
    # scoring helpers (presumably debug selectors -- confirm where defined).
    TP, TN, FP, FN = score_errors(model, page_tuples, page_labels,
                                  page_num_labels, True, TVDBG_P, TVDBG_L)
    matthews = score_matthews(TP, TN, FP, FN, TVDBG_P, TVDBG_L)
    print('score = {}'.format(matthews))
    page_total, page_count = get_page_count(page_labels)
    wgt_avg = weighted_average(page_total, page_count, matthews)
    print('weighted average = {}'.format(wgt_avg))