def split_train_test_highest(fname, codeprefixlen, mincodeoccurences, filtered_by):
    """Split the records in fname into train/test generator factories.

    Labels are MC codes truncated to codeprefixlen characters; only labels
    occurring at least mincodeoccurences times are kept, and records lacking
    any of the filtered_by fields are dropped.  Roughly 1/10 of the surviving
    records are held out for testing.

    Returns (train_generator, test_generator, elements_count, labels,
    elements_count) -- elements_count appears twice to preserve the historical
    5-tuple interface callers expect.
    """
    # prepare generators -- zero-argument lambdas so each call restarts
    # iteration over the file from the beginning
    rec_generator = lambda: gen_record(fname, filtered_by)
    prefixed_rec_generator = lambda: gen_record_prefixed(rec_generator, codeprefixlen)
    prefix_code_generator = lambda: gen_lmc(prefixed_rec_generator)
    # generate labels
    PRINTER('generating labels...')
    labels = get_labels_min_occurence(prefix_code_generator, mincodeoccurences)
    PRINTER('labels generated:')
    PRINTER(str(labels))
    # gen filtered records: keep only records carrying an accepted label
    labelsset = set(labels)
    prefix_code_generator = lambda: gen_record_filteredbylabels(prefixed_rec_generator, labelsset)
    PRINTER('counting elements...')
    elements_count = len(list(prefix_code_generator()))
    PRINTER('number of elements' + str(elements_count))
    # split into training and testing samples (~10% held out for testing)
    PRINTER('splitting into training and testing...')
    train_inds, test_inds = randomly_divide(elements_count, int(elements_count / 10))
    train_generator = lambda: gen_record_fromshifts(prefix_code_generator, train_inds)
    test_generator = lambda: gen_record_fromshifts(prefix_code_generator, test_inds)
    PRINTER('splitted.')
    # fix: the original recomputed elements_count here with a second full pass
    # over the data; prefix_code_generator has not changed since it was counted
    # above, so that pass was pure waste and has been removed.
    return train_generator, test_generator, elements_count, labels, elements_count
def words_freqs(records, features): #count each word in abstract and title: all_w = 0 words = defaultdict(lambda: 0) for rec in gen_record(records, features+['mc']): words_l = text_to_words(" ".join([rec[f] for f in features]) ) for c in words_l: words[c]+=1 all_w+=1 print "count of all words:", all_w print "words found:" w_sorted = sorted(list(words.iteritems()), key=lambda x:x[1], reverse=True) for k, v in w_sorted: print k, ":", v
def count_label_statistics(fname, fields): """ Counts the following statistics and prints them. D is the dataset filtered by the condition to contain all of the fields L is number of distinct labels in D. -Label cardinality: the average number of labels of the examples in D -Label density: the average number of labels of the examples in D divided by |L| -Bolo11: the percentage of documents that contain at least 2 labels of the same 2 code prefix -Bolo12: the percentage of documents that contain at least 2 labels of the same 3 code prefix -Bolo21: the percentage of documents that contain at least 2 labels of different 2 code prefix -Bolo22: the percentage of documents that contain at least 2 labels of different 3 code prefix """ all = 0 labels = set() lc = 0 ld = 0 bolo11 = 0 bolo12 = 0 bolo21 = 0 bolo22 = 0 #count statistics for lmc in gen_lmc(lambda: gen_record(fname, fields), fields): all += 1 for mc in lmc: labels.add(mc) lc += len(lmc) ld += len(lmc) if contains2of_same_prefix(lmc, 2): bolo11 += 1 if contains2of_same_prefix(lmc, 3): bolo12 += 1 if contains2of_diff_prefix(lmc, 2): bolo21 += 1 if contains2of_diff_prefix(lmc, 3): bolo22 += 1 print '[count_label_statistics]: lmc:', lmc, contains2of_same_prefix( lmc, 2), contains2of_same_prefix(lmc, 3), contains2of_diff_prefix( lmc, 2), contains2of_diff_prefix(lmc, 3) #print statistics print "lc:", lc / all print "ld:", ld / (all * len(labels)) print "bolo11 contain at least 2 of same 2 code prefix:", bolo11 / all print "bolo12 contain at least 2 of same 3 code prefix:", bolo12 / all print "bolo21 contain at least 2 of diff 2 code prefix:", bolo21 / all print "bolo22 contain at least 2 of diff 3 code prefix:", bolo22 / all
def count_label_statistics(fname, fields): """ Counts the following statistics and prints them. D is the dataset filtered by the condition to contain all of the fields L is number of distinct labels in D. -Label cardinality: the average number of labels of the examples in D -Label density: the average number of labels of the examples in D divided by |L| -Bolo11: the percentage of documents that contain at least 2 labels of the same 2 code prefix -Bolo12: the percentage of documents that contain at least 2 labels of the same 3 code prefix -Bolo21: the percentage of documents that contain at least 2 labels of different 2 code prefix -Bolo22: the percentage of documents that contain at least 2 labels of different 3 code prefix """ all = 0 labels = set() lc = 0 ld = 0 bolo11 = 0 bolo12 = 0 bolo21 = 0 bolo22 = 0 #count statistics for lmc in gen_lmc(lambda: gen_record(fname, fields), fields): all+=1 for mc in lmc: labels.add(mc) lc += len(lmc) ld += len(lmc) if contains2of_same_prefix(lmc, 2): bolo11 += 1 if contains2of_same_prefix(lmc, 3): bolo12 += 1 if contains2of_diff_prefix(lmc, 2): bolo21 += 1 if contains2of_diff_prefix(lmc, 3): bolo22 += 1 print '[count_label_statistics]: lmc:', lmc, contains2of_same_prefix(lmc, 2), contains2of_same_prefix(lmc, 3), contains2of_diff_prefix(lmc, 2), contains2of_diff_prefix(lmc, 3) #print statistics print "lc:", lc/all print "ld:", ld/(all*len(labels)) print "bolo11 contain at least 2 of same 2 code prefix:", bolo11/all print "bolo12 contain at least 2 of same 3 code prefix:", bolo12/all print "bolo21 contain at least 2 of diff 2 code prefix:", bolo21/all print "bolo22 contain at least 2 of diff 3 code prefix:", bolo22/all
def load_labels_codegen_elemcnt(fname, codeprefixlen, mincodeoccurences, filtered_by):
    """Collect the accepted labels and build a filtered record generator.

    Labels are code prefixes of length codeprefixlen occurring at least
    mincodeoccurences times; records missing any of the filtered_by fields
    are skipped.

    Returns (labels, labels-as-set, zero-argument generator factory over the
    label-filtered records, number of such records).
    """
    # generator factories: each call restarts iteration over the file
    def raw_records():
        return gen_record(fname, filtered_by)

    def prefixed_records():
        return gen_record_prefixed(raw_records, codeprefixlen)

    PRINTER('generating labels...')
    labels = get_labels_min_occurence(lambda: gen_lmc(prefixed_records),
                                      mincodeoccurences)
    PRINTER('labels generated:')
    PRINTER(str(labels))
    # keep only records carrying at least one accepted label
    accepted = set(labels)

    def filtered_records():
        return gen_record_filteredbylabels(prefixed_records, accepted)

    PRINTER('counting elements...')
    count = len(list(filtered_records()))
    PRINTER('number of elements' + str(count))
    return labels, accepted, filtered_records, count
def load_labels_codegen_elemcnt(fname, codeprefixlen, mincodeoccurences, filtered_by):
    """Prepare the label set and a label-filtered record generator.

    Reads fname, truncates codes to codeprefixlen characters, keeps labels
    seen at least mincodeoccurences times, and drops records lacking any of
    the filtered_by fields.

    Returns (labels, labelsset, generator factory over filtered records,
    element count).
    """
    # zero-argument lambdas so every call re-reads the file from the start
    records = lambda: gen_record(fname, filtered_by)
    prefixed = lambda: gen_record_prefixed(records, codeprefixlen)
    codes = lambda: gen_lmc(prefixed)

    PRINTER('generating labels...')
    labels = get_labels_min_occurence(codes, mincodeoccurences)
    PRINTER('labels generated:')
    PRINTER(str(labels))

    # restrict the record stream to records with an accepted label
    labelsset = set(labels)
    filtered = lambda: gen_record_filteredbylabels(prefixed, labelsset)

    PRINTER('counting elements...')
    elements_count = len(list(filtered()))
    PRINTER('number of elements' + str(elements_count))
    return labels, labelsset, filtered, elements_count
# NOTE(review): the names printed below (codeprefixlen, mincodeoccurences, k,
# smoothingparam, ..., fname) are defined earlier in the script, outside this
# excerpt -- presumably parsed from sys.argv; confirm against the full file.
# Echo the run configuration so experiment logs are self-describing.
print "codeprefixlen:", codeprefixlen
print "mincodeoccurences", mincodeoccurences
print "k:", k
print "smoothingparam:", smoothingparam
print "distancetrainingsteps:", distancetrainingsteps
print "filtered_by:", filtered_by
print "save_hierarchical_path:", save_hierarchical_path
print "save_train_generator_path:", save_train_generator_path
print "save_lenlabels_path:", save_lenlabels_path
log_level = logging.INFO
logging.basicConfig(level=log_level)
#prepare generators -- zero-argument lambdas so each call restarts iteration
#over the records file from the beginning
rec_generator = lambda: gen_record(fname, filtered_by)
prefixed_rec_generator = lambda: gen_record_prefixed(rec_generator, codeprefixlen)
prefix_code_generator = lambda: gen_lmc(prefixed_rec_generator)
#generate labels: keep only code prefixes seen at least mincodeoccurences times
print "generating labels..."
labels = get_labels_min_occurence(prefix_code_generator, mincodeoccurences)
labelsset = set(labels)
print "labels generated."
print labels
#gen filtered records: drop records carrying none of the accepted labels
prefix_code_generator = lambda: gen_record_filteredbylabels(prefixed_rec_generator, labelsset)
print "counting elements..."
elements_count = len(list(prefix_code_generator()))
print "number of elements:", elements_count
except: print '5th argument: number of labels to consider' sys.exit(1) try: shuffling_cnt = int(sys.argv[6]) except: print '6th argument: How many records to sample before the further filtering.' sys.exit(1) try: filtered_by = sys.argv[7:] except: print '7th argument: list of the fields to exist in considered records.' sys.exit(1) #prepare generators rec_generator_first = lambda: gen_record(fname, filtered_by) #choosing shuffling_cnt elements in random: PRINTER("shuffling in random") import random chosen_records = random.sample(list(rec_generator_first()), shuffling_cnt) rec_generator = lambda: chosen_records prefixed_rec_generator = lambda: gen_record_prefixed(rec_generator, codeprefixlen) prefix_code_generator = lambda: gen_lmc(prefixed_rec_generator) #generate labels PRINTER("generating labels...") labels_counts = get_labels_counts(prefix_code_generator, mincodeoccurences) #PRINTER("labels generated." #PRINTER(sorted(labels_counts, key = lambda x: x[1], reverse = True) biggest_labels = map(lambda x: x[0], sorted(labels_counts, key = lambda x: x[1],
from zbl2py import record_read from classifier_tester import LeaveOneOutAllCategories from classifier_knn import KnnMatrixClassifier import os #lib_path = os.path.abspath(os.path.sep.join(['..', 'topic_classification'])) #sys.path.append(lib_path) lib_path = os.path.abspath( os.path.sep.join(['..', '..', '..', 'document_classification'])) sys.path.append(lib_path) from data_io.zbl_record_generators import gen_record, mc2lmc_tomka_blad if __name__ == '__main__': records_file = sys.argv[1] test_samples = int(sys.argv[2]) print "Arguments read:" print "records_file =", records_file print "test_samples =", test_samples frecords = lambda: gen_record(records_file, ['mc', 'ti', 'ab', 'au']) #loo = LeaveOneOutAllCategories(KnnMatrixClassifier, frecords, mc2lmc_tomka_blad) #corr = loo.test(test_samples) #print "Correctness:", corr print "---training a classifier..." knn = KnnMatrixClassifier(frecords, 7000, 100, mc2lmc_tomka_blad) print "---performing leave one out..." corr = knn.loo(test_samples) print "COrrectness:", corr
sys.path.append(r'../') from zbl2py import record_read from classifier_tester import LeaveOneOutAllCategories from classifier_knn import KnnMatrixClassifier import os #lib_path = os.path.abspath(os.path.sep.join(['..', 'topic_classification'])) #sys.path.append(lib_path) lib_path = os.path.abspath(os.path.sep.join(['..', '..', '..', 'document_classification'])) sys.path.append(lib_path) from data_io.zbl_record_generators import gen_record, mc2lmc_tomka_blad if __name__ == '__main__': records_file = sys.argv[1] test_samples = int(sys.argv[2]) print "Arguments read:" print "records_file =", records_file print "test_samples =", test_samples frecords = lambda: gen_record(records_file, ['mc', 'ti', 'ab', 'au']) #loo = LeaveOneOutAllCategories(KnnMatrixClassifier, frecords, mc2lmc_tomka_blad) #corr = loo.test(test_samples) #print "Correctness:", corr print "---training a classifier..." knn = KnnMatrixClassifier(frecords, 7000, 100, mc2lmc_tomka_blad) print "---performing leave one out..." corr = knn.loo(test_samples) print "COrrectness:", corr