def main():
    """Classify pages of text against a trained model.

    Arguments:
        argv[1]: text to be classified (one page per line: URL, tab, text)
        argv[2]: model
        argv[3]: classification result (one PAGE line per input page)
    """
    if len(sys.argv) != 4:
        print('Usage: classifier.py data model output')
        sys.exit(1)  # usage error must exit non-zero, not 0
    pages = read_data_text(sys.argv[1])
    model = read_model(sys.argv[2])
    print('model has {} labels and {} tuples'.format(len(model['labels']), len(model['learners'])))
    label_names = [label.encode("utf8") for label in model['labels']]
    print('read {} pages'.format(len(pages)))
    with open(sys.argv[3], 'w') as output_file:
        for page in pages:
            url, text = page.split('\t', 1)
            # collapse all runs of whitespace to single spaces
            text = ' '.join(text.split())
            scores = classifier_text(model, text)
            # pair each label name with its score; sort highest score first
            label_scores = [[name, score] for name, score in zip(label_names, scores)]
            sorted_label_scores = sorted(label_scores, key=lambda x: x[1], reverse=True)
            output_file.write('PAGE\t{}\t{}\n'.format(url, sorted_label_scores))
def main():
    """Print summary statistics for a model file.

    Command Line Inputs:
        argv[1]: Input Model file with labels and array of weak learners
    """
    # Guard argv access: the original crashed with IndexError when run bare.
    if len(sys.argv) != 2:
        sys.exit('Usage: {} model'.format(sys.argv[0]))
    filename = sys.argv[1]
    print('open {}'.format(filename))
    model = read_model(filename)
    print('model has {} labels and {} tuples'.format(len(model['labels']), len(model['learners'])))
def main():
    """Apply naive bayesian weighting to a model's weak learners.

    Command Line Inputs:
        argv[1]: Data file with lines: PAGE URL LABEL_1 LABEL_2 ... LABEL_n
                 and associated text for page
        argv[2]: Input Model file with labels and array of weak learners
        argv[3]: Output New Model file with labels and array of weak learners
                 with naive bayesian weights
    """
    # Guard argv access: the original crashed with IndexError when run bare.
    if len(sys.argv) != 4:
        sys.exit('Usage: {} data model output'.format(sys.argv[0]))
    pages, labels, label_text, tuples, tuples_selected, tuple_text = read_data_tuples(
        sys.argv[1])
    model = read_model(sys.argv[2])
    print('model has {} labels and {} tuples'.format(len(model['labels']), len(model['learners'])))
    naivebayes(model, labels, tuples, tuples_selected, label_text, tuple_text)
    write_model(sys.argv[3], model)
def main():
    """Apply count-normalized weighting to a model's weak learners.

    Command Line Inputs:
        argv[1]: Data file with lines: PAGE URL LABEL_1 LABEL_2 ... LABEL_n
                 and associated text for page
        argv[2]: Input Model file with labels and array of weak learners
        argv[3]: Output New Model file with labels and array of weak learners
                 with normalized weights
    """
    # Guard argv access: the original crashed with IndexError when run bare.
    if len(sys.argv) != 4:
        sys.exit('Usage: {} data model output'.format(sys.argv[0]))
    pages, labels, label_text, tuples, tuples_selected, tuple_text = read_data_tuples(
        sys.argv[1])
    model = read_model(sys.argv[2])
    print('model has {} labels and {} tuples'.format(len(model['labels']), len(model['learners'])))
    cntnorm(model, labels, tuples, label_text, tuple_text)
    write_model(sys.argv[3], model)
def main():
    """Normalize a model against labeled data and report its quality.

    Command Line Inputs:
        argv[1]: Data file with lines: PAGE URL LABEL_1 LABEL_2 ... LABEL_n
                 and associated text for page
        argv[2]: Input Model file with labels and array of weak learners
        argv[3]: Output Model file with correction factors

    Prints the matthews correlation coefficient for each label, before and
    after normalization.
    """
    # Guard argv access: the original crashed with IndexError when run bare.
    if len(sys.argv) != 4:
        sys.exit('Usage: {} data model output'.format(sys.argv[0]))
    model = read_model(sys.argv[2])
    print('model has {} labels and {} tuples'.format(len(model['labels']), len(model['learners'])))
    page_tuples, page_labels, page_num_labels = read_data_labels(sys.argv[1], model)
    print('read {} pages'.format(len(page_labels)))
    # Evaluate before normalization so the two reports can be compared.
    evaluate(model, page_tuples, page_labels, page_num_labels)
    model = normalize(model, page_tuples, page_labels, page_num_labels)
    write_model(sys.argv[3], model)
    # Evaluate again with the correction factors applied.
    evaluate(model, page_tuples, page_labels, page_num_labels)
def main():
    """Score a model's positive predictive value on labeled data.

    Command Line Inputs:
        argv[1]: Data file with lines: PAGE URL LABEL_1 LABEL_2 ... LABEL_n
                 and associated text for page
        argv[2]: Input Model file with labels and array of weak learners

    Outputs the positive score TP / (TP + FP) for each label and the
    page-count-weighted average.
    """
    # Guard argv access: the original crashed with IndexError when run bare.
    if len(sys.argv) != 3:
        sys.exit('Usage: {} data model'.format(sys.argv[0]))
    model = read_model(sys.argv[2])
    page_tuples, page_labels, page_num_labels = read_data_labels(sys.argv[1], model)
    # TVDBG_P / TVDBG_L are module-level debug selectors — presumably a page
    # and label index to trace; confirm against their definitions.
    TP, TN, FP, FN = score_errors(model, page_tuples, page_labels, page_num_labels,
                                  False, TVDBG_P, TVDBG_L)
    score = score_positive(TP, TN, FP, FN, TVDBG_P, TVDBG_L)
    print('score = {}'.format(score))
    page_total, page_count = get_page_count(page_labels)
    wgt_avg = weighted_average(page_total, page_count, score)
    print('weighted average = {}'.format(wgt_avg))
def main():
    """Score a model with the Matthews correlation coefficient.

    Command Line Inputs:
        argv[1]: Data file with lines: PAGE URL LABEL_1 LABEL_2 ... LABEL_n
                 and associated text for page
        argv[2]: Input Model file with labels and array of weak learners

    Outputs the matthews correlation coefficient for each label and the
    page-count-weighted average.
    """
    # Guard argv access: the original crashed with IndexError when run bare.
    if len(sys.argv) != 3:
        sys.exit('Usage: {} data model'.format(sys.argv[0]))
    model = read_model(sys.argv[2])
    page_tuples, page_labels, page_num_labels = read_data_labels(sys.argv[1], model)
    TP, TN, FP, FN = score_errors(model, page_tuples, page_labels, page_num_labels, False)
    matthews = score_matthews(TP, TN, FP, FN)
    print('score = {}'.format(matthews))
    page_total, page_count = get_page_count(page_labels)
    wgt_avg = weighted_average(page_total, page_count, matthews)
    print('weighted average = {}'.format(wgt_avg))
def main():
    """Score a model's positive predictive value on labeled data.

    Command Line Inputs:
        argv[1]: Data file with lines: PAGE URL LABEL_1 LABEL_2 ... LABEL_n
                 and associated text for page
        argv[2]: Input Model file with labels and array of weak learners

    Outputs the positive score TP / (TP + FP) for each label and the
    page-count-weighted average.
    """
    # Guard argv access: the original crashed with IndexError when run bare.
    if len(sys.argv) != 3:
        sys.exit('Usage: {} data model'.format(sys.argv[0]))
    model = read_model(sys.argv[2])
    page_tuples, page_labels, page_num_labels = read_data_labels(sys.argv[1], model)
    TP, TN, FP, FN = score_errors(model, page_tuples, page_labels, page_num_labels, False)
    score = score_positive(TP, TN, FP, FN)
    print('score = {}'.format(score))
    page_total, page_count = get_page_count(page_labels)
    wgt_avg = weighted_average(page_total, page_count, score)
    print('weighted average = {}'.format(wgt_avg))
def main():
    """Apply count-normalized weighting to a model's weak learners.

    Command Line Inputs:
        argv[1]: Data file with lines: PAGE URL LABEL_1 LABEL_2 ... LABEL_n
                 and associated text for page
        argv[2]: Input Model file with labels and array of weak learners
        argv[3]: Output New Model file with labels and array of weak learners
                 with normalized weights
    """
    # Guard argv access: the original crashed with IndexError when run bare.
    if len(sys.argv) != 4:
        sys.exit('Usage: {} data model output'.format(sys.argv[0]))
    pages, labels, label_text, tuples, tuples_selected, tuple_text = read_data_tuples(
        sys.argv[1])
    model = read_model(sys.argv[2])
    print('model has {} labels and {} tuples'.format(len(model['labels']), len(model['learners'])))
    cntnorm(model, labels, tuples, label_text, tuple_text)
    write_model(sys.argv[3], model)
def main():
    """Normalize a model against labeled data and report its quality.

    Command Line Inputs:
        argv[1]: Data file with lines: PAGE URL LABEL_1 LABEL_2 ... LABEL_n
                 and associated text for page
        argv[2]: Input Model file with labels and array of weak learners
        argv[3]: Output Model file with correction factors

    Prints the matthews correlation coefficient for each label, before and
    after normalization.
    """
    # Guard argv access: the original crashed with IndexError when run bare.
    if len(sys.argv) != 4:
        sys.exit('Usage: {} data model output'.format(sys.argv[0]))
    model = read_model(sys.argv[2])
    print('model has {} labels and {} tuples'.format(len(model['labels']), len(model['learners'])))
    page_tuples, page_labels, page_num_labels = read_data_labels(
        sys.argv[1], model)
    print('read {} pages'.format(len(page_labels)))
    # Evaluate before normalization so the two reports can be compared.
    evaluate(model, page_tuples, page_labels, page_num_labels)
    model = normalize(model, page_tuples, page_labels, page_num_labels)
    write_model(sys.argv[3], model)
    # Evaluate again with the correction factors applied.
    evaluate(model, page_tuples, page_labels, page_num_labels)
def main():
    """Score a model with the Matthews correlation coefficient (debug variant).

    Command Line Inputs:
        argv[1]: Data file with lines: PAGE URL LABEL_1 LABEL_2 ... LABEL_n
                 and associated text for page
        argv[2]: Input Model file with labels and array of weak learners

    Outputs the matthews correlation coefficient for each label and the
    page-count-weighted average.
    """
    # Guard argv access: the original crashed with IndexError when run bare.
    if len(sys.argv) != 3:
        sys.exit('Usage: {} data model'.format(sys.argv[0]))
    model = read_model(sys.argv[2])
    page_tuples, page_labels, page_num_labels = read_data_labels(
        sys.argv[1], model)
    # NOTE(review): unlike the sibling scorers this passes True as the fifth
    # score_errors argument — presumably a verbose/debug flag; confirm.
    # TVDBG_P / TVDBG_L are module-level debug selectors.
    TP, TN, FP, FN = score_errors(model, page_tuples, page_labels, page_num_labels,
                                  True, TVDBG_P, TVDBG_L)
    matthews = score_matthews(TP, TN, FP, FN, TVDBG_P, TVDBG_L)
    print('score = {}'.format(matthews))
    page_total, page_count = get_page_count(page_labels)
    wgt_avg = weighted_average(page_total, page_count, matthews)
    print('weighted average = {}'.format(wgt_avg))