def split_train_test_highest(fname, codeprefixlen, mincodeoccurences, filtered_by):
    """Split the records of *fname* into train/test generators (1/10 test).

    Records are prefixed to *codeprefixlen*, labels occurring fewer than
    *mincodeoccurences* times are dropped, and the surviving records are
    randomly divided so that one tenth land in the test split.

    Returns (train_generator, test_generator, elements_count, labels,
    elements_count) -- elements_count appears twice to keep the historical
    5-tuple shape that callers unpack.
    """
    # prepare generators
    rec_generator = lambda: gen_record(fname, filtered_by)
    prefixed_rec_generator = lambda: gen_record_prefixed(rec_generator, codeprefixlen)
    prefix_code_generator = lambda: gen_lmc(prefixed_rec_generator)
    # generate labels
    PRINTER('generating labels...')
    labels = get_labels_min_occurence(prefix_code_generator, mincodeoccurences)
    PRINTER('labels generated:')
    PRINTER(str(labels))
    # gen filtered records: keep only records carrying a surviving label
    labelsset = set(labels)
    prefix_code_generator = lambda: gen_record_filteredbylabels(prefixed_rec_generator, labelsset)
    PRINTER('counting elements...')
    elements_count = len(list(prefix_code_generator()))
    PRINTER('number of elements' + str(elements_count))
    # split into training and testing samples
    PRINTER('splitting into training and testing...')
    train_inds, test_inds = randomly_divide(elements_count, int(elements_count / 10))
    train_generator = lambda: gen_record_fromshifts(prefix_code_generator, train_inds)
    test_generator = lambda: gen_record_fromshifts(prefix_code_generator, test_inds)
    PRINTER('splitted.')
    # NOTE(review): the original recomputed elements_count here with a second
    # full pass over the data; the generator is unchanged, so the count is
    # identical and the redundant pass was removed.
    return train_generator, test_generator, elements_count, labels, elements_count
def count_label_statistics(fname, fields): """ Counts the following statistics and prints them. D is the dataset filtered by the condition to contain all of the fields L is number of distinct labels in D. -Label cardinality: the average number of labels of the examples in D -Label density: the average number of labels of the examples in D divided by |L| -Bolo11: the percentage of documents that contain at least 2 labels of the same 2 code prefix -Bolo12: the percentage of documents that contain at least 2 labels of the same 3 code prefix -Bolo21: the percentage of documents that contain at least 2 labels of different 2 code prefix -Bolo22: the percentage of documents that contain at least 2 labels of different 3 code prefix """ all = 0 labels = set() lc = 0 ld = 0 bolo11 = 0 bolo12 = 0 bolo21 = 0 bolo22 = 0 #count statistics for lmc in gen_lmc(lambda: gen_record(fname, fields), fields): all += 1 for mc in lmc: labels.add(mc) lc += len(lmc) ld += len(lmc) if contains2of_same_prefix(lmc, 2): bolo11 += 1 if contains2of_same_prefix(lmc, 3): bolo12 += 1 if contains2of_diff_prefix(lmc, 2): bolo21 += 1 if contains2of_diff_prefix(lmc, 3): bolo22 += 1 print '[count_label_statistics]: lmc:', lmc, contains2of_same_prefix( lmc, 2), contains2of_same_prefix(lmc, 3), contains2of_diff_prefix( lmc, 2), contains2of_diff_prefix(lmc, 3) #print statistics print "lc:", lc / all print "ld:", ld / (all * len(labels)) print "bolo11 contain at least 2 of same 2 code prefix:", bolo11 / all print "bolo12 contain at least 2 of same 3 code prefix:", bolo12 / all print "bolo21 contain at least 2 of diff 2 code prefix:", bolo21 / all print "bolo22 contain at least 2 of diff 3 code prefix:", bolo22 / all
def count_label_statistics(fname, fields): """ Counts the following statistics and prints them. D is the dataset filtered by the condition to contain all of the fields L is number of distinct labels in D. -Label cardinality: the average number of labels of the examples in D -Label density: the average number of labels of the examples in D divided by |L| -Bolo11: the percentage of documents that contain at least 2 labels of the same 2 code prefix -Bolo12: the percentage of documents that contain at least 2 labels of the same 3 code prefix -Bolo21: the percentage of documents that contain at least 2 labels of different 2 code prefix -Bolo22: the percentage of documents that contain at least 2 labels of different 3 code prefix """ all = 0 labels = set() lc = 0 ld = 0 bolo11 = 0 bolo12 = 0 bolo21 = 0 bolo22 = 0 #count statistics for lmc in gen_lmc(lambda: gen_record(fname, fields), fields): all+=1 for mc in lmc: labels.add(mc) lc += len(lmc) ld += len(lmc) if contains2of_same_prefix(lmc, 2): bolo11 += 1 if contains2of_same_prefix(lmc, 3): bolo12 += 1 if contains2of_diff_prefix(lmc, 2): bolo21 += 1 if contains2of_diff_prefix(lmc, 3): bolo22 += 1 print '[count_label_statistics]: lmc:', lmc, contains2of_same_prefix(lmc, 2), contains2of_same_prefix(lmc, 3), contains2of_diff_prefix(lmc, 2), contains2of_diff_prefix(lmc, 3) #print statistics print "lc:", lc/all print "ld:", ld/(all*len(labels)) print "bolo11 contain at least 2 of same 2 code prefix:", bolo11/all print "bolo12 contain at least 2 of same 3 code prefix:", bolo12/all print "bolo21 contain at least 2 of diff 2 code prefix:", bolo21/all print "bolo22 contain at least 2 of diff 3 code prefix:", bolo22/all
def load_labels_codegen_elemcnt(fname, codeprefixlen, mincodeoccurences, filtered_by):
    """Build the frequent-label set and a filtered record generator.

    Records from *fname* are code-prefixed to *codeprefixlen*; labels seen
    fewer than *mincodeoccurences* times are discarded, and the returned
    generator yields only records carrying a surviving label.

    Returns (labels, labelsset, prefix_code_generator, elements_count).
    """
    # prepare generators
    rec_generator = lambda: gen_record(fname, filtered_by)
    prefixed_rec_generator = lambda: gen_record_prefixed(rec_generator, codeprefixlen)
    prefix_code_generator = lambda: gen_lmc(prefixed_rec_generator)
    # generate labels
    PRINTER('generating labels...')
    labels = get_labels_min_occurence(prefix_code_generator, mincodeoccurences)
    PRINTER('labels generated:')
    PRINTER(str(labels))
    # gen filtered records:
    labelsset = set(labels)
    prefix_code_generator = lambda: gen_record_filteredbylabels(prefixed_rec_generator, labelsset)
    PRINTER('counting elements...')
    # count lazily instead of materialising the whole dataset in a list
    elements_count = sum(1 for _ in prefix_code_generator())
    PRINTER('number of elements' + str(elements_count))
    return labels, labelsset, prefix_code_generator, elements_count
def load_labels_codegen_elemcnt(fname, codeprefixlen, mincodeoccurences, filtered_by):
    """Build the frequent-label set and a filtered record generator.

    Records from *fname* are code-prefixed to *codeprefixlen*; labels seen
    fewer than *mincodeoccurences* times are discarded, and the returned
    generator yields only records carrying a surviving label.

    Returns (labels, labelsset, filtered generator, elements_count).
    """
    def raw_records():
        return gen_record(fname, filtered_by)

    def prefixed_records():
        return gen_record_prefixed(raw_records, codeprefixlen)

    def prefixed_codes():
        return gen_lmc(prefixed_records)

    # collect the labels that occur often enough
    PRINTER('generating labels...')
    labels = get_labels_min_occurence(prefixed_codes, mincodeoccurences)
    PRINTER('labels generated:')
    PRINTER(str(labels))

    labelsset = set(labels)

    def filtered_records():
        # only records that carry at least one surviving label
        return gen_record_filteredbylabels(prefixed_records, labelsset)

    PRINTER('counting elements...')
    elements_count = len(list(filtered_records()))
    PRINTER('number of elements' + str(elements_count))
    return labels, labelsset, filtered_records, elements_count
# --- script section: echo the run parameters, then build the filtered record
# generator. Chunk of a larger script: k, smoothingparam, fname,
# codeprefixlen, mincodeoccurences, the save_* paths etc. are assigned
# earlier, outside this excerpt. ---
print "k:", k
print "smoothingparam:", smoothingparam
print "distancetrainingsteps:", distancetrainingsteps
print "filtered_by:", filtered_by
print "save_hierarchical_path:", save_hierarchical_path
print "save_train_generator_path:", save_train_generator_path
print "save_lenlabels_path:", save_lenlabels_path
log_level = logging.INFO
logging.basicConfig(level=log_level)
#prepare generators
rec_generator = lambda: gen_record(fname, filtered_by)
prefixed_rec_generator = lambda: gen_record_prefixed(rec_generator, codeprefixlen)
prefix_code_generator = lambda: gen_lmc(prefixed_rec_generator)
#generate labels
print "generating labels..."
labels = get_labels_min_occurence(prefix_code_generator, mincodeoccurences)
labelsset = set(labels)
print "labels generated."
print labels
#gen filtered records:
# rebind prefix_code_generator so the code below only sees records whose
# labels survived the min-occurrence filter
prefix_code_generator = lambda: gen_record_filteredbylabels(prefixed_rec_generator, labelsset)
print "counting elements..."
elements_count = len(list(prefix_code_generator()))
print "number of elements:", elements_count
#split into training and testing samples
# --- script section: load pickled training data, pick a distance measure,
# train an MlKnnFractional classifier and pickle it. Chunk of a larger
# script: load_*_path, distancetype, distancetrainingsteps, k and
# save_classifier_path are assigned earlier, outside this excerpt. ---
from tools.pickle_tools import read_pickle
train_generator_list = read_pickle(load_train_generator_path)
PRINTER('Loading labels path and elements count...')
lenlabels = len(read_pickle(load_labels_path))
elements_count = read_pickle(load_elements_count_path)
PRINTER("training distance...")
train_generator = lambda: train_generator_list
if distancetype == 'jac':
    from mlknn.jaccard_distance import JaccardDistance
    # train on the training part only: all elements minus the 10% test split
    zbldistance = JaccardDistance(train_generator, elements_count - int(elements_count / 10), distancetrainingsteps)
else:
    # any other value is interpreted as a cosine-distance variant name
    from mlknn.txt_cosine_distance import TxtCosineDistance
    zbldistance = TxtCosineDistance(distancetype)
PRINTER("Finding label list...")
get_labels_of_record = mc2lmc_tomka_blad
find_all_labels = lambda frecords: get_labels_min_occurence(lambda: gen_lmc(frecords), 1)
PRINTER("Training MLKNN...")
from time import time
start = time()
mlknn_single = MlKnnFractional(train_generator, zbldistance, find_closest_points, k, get_labels_of_record)
# BUGFIX: elapsed time is time()-start; the original printed start-time(),
# which is always negative.
PRINTER("Time taken for training:"+str(time()-start))
from tools.pickle_tools import save_pickle
PRINTER("MLKNN: pickling the classifier...")
save_pickle(mlknn_single, save_classifier_path)
try: filtered_by = sys.argv[7:] except: print '7th argument: list of the fields to exist in considered records.' sys.exit(1) #prepare generators rec_generator_first = lambda: gen_record(fname, filtered_by) #choosing shuffling_cnt elements in random: PRINTER("shuffling in random") import random chosen_records = random.sample(list(rec_generator_first()), shuffling_cnt) rec_generator = lambda: chosen_records prefixed_rec_generator = lambda: gen_record_prefixed(rec_generator, codeprefixlen) prefix_code_generator = lambda: gen_lmc(prefixed_rec_generator) #generate labels PRINTER("generating labels...") labels_counts = get_labels_counts(prefix_code_generator, mincodeoccurences) #PRINTER("labels generated." #PRINTER(sorted(labels_counts, key = lambda x: x[1], reverse = True) biggest_labels = map(lambda x: x[0], sorted(labels_counts, key = lambda x: x[1], reverse = True))[:biggest_labels_cnt] labelsset = set(biggest_labels) PRINTER(biggest_labels) #gen filtered records: prefix_code_generator = lambda: gen_record_filteredbylabels(prefixed_rec_generator, labelsset) PRINTER("counting elements...") elements_count = len(list(prefix_code_generator()))
# NOTE(review): this chunk starts inside an argument-parsing try/except whose
# `try:` line lies above this excerpt; `sys` must already be imported there,
# since it is used before the `import sys` further below.
except:
    print '8th argument expected: load_test_generator parameter'
    sys.exit(1)
# load the pickled training records and the label/element-count metadata
PRINTER('Loading training list...')
from tools.pickle_tools import read_pickle
train_generator_list = read_pickle(load_train_generator_path)
PRINTER('Loading labels path and elements count...')
lenlabels = len(read_pickle(load_labels_path))
elements_count = read_pickle(load_elements_count_path)
PRINTER("Finding label list...")
get_labels_of_record = mc2lmc_tomka_blad
find_all_labels = lambda frecords: get_labels_min_occurence(
    lambda: gen_lmc(frecords), 1)
# load the precomputed distance matrix and index its rows/columns by
# record id so distances can be looked up by id pairs later
PRINTER("Loading distance matrix...")
import sys
sys.path.append(r'../')
from data_io.matrix_io import fread_smatrix
(rows, cols, data) = fread_smatrix(distancematrix)
id2rowind, id2colind = {}, {}
for ind, id in enumerate(rows):
    id2rowind[id] = ind
for ind, id in enumerate(cols):
    id2colind[id] = ind
print "len(train_generator_list):", len(train_generator_list)
print "len(rows):", len(rows)
#print "(rows, cols, data):", (rows, cols, data)
def main(train_generator_list, labels, elements_count, classifier_name, k, smoothing_param, distancematrix, test_generator):
    """Train the classifier named by *classifier_name* and evaluate it.

    Supported names: 'mlknn_basic', 'mlknn_threshold', 'mlknn_tensembled'
    and their hierarchical '-tree' variants. *k* is an int-like string for
    the single-k classifiers or a comma-separated list for the ensembled
    ones. *distancematrix* is the path of a precomputed pairwise distance
    matrix indexed by record id (the 'an' field).

    Returns (accuracy, precision, recall, hammingloss, subset01loss,
    fmeasure) as produced by multilabel_evaluate.
    """
    PRINTER("Finding label list...")
    get_labels_of_record = mc2lmc_tomka_blad
    find_all_labels = lambda frecords: get_labels_min_occurence(lambda: gen_lmc(frecords), 1)

    PRINTER("Loading distance matrix...")
    import sys
    sys.path.append(r'../')
    from data_io.matrix_io import fread_smatrix
    (rows, cols, data) = fread_smatrix(distancematrix)
    # map record ids ('an' field) to row/column indices of the matrix
    id2rowind, id2colind = {}, {}
    for ind, id in enumerate(rows):
        id2rowind[id] = ind
    for ind, id in enumerate(cols):
        id2colind[id] = ind

    PRINTER("Training classifier...")
    from time import time

    def printer(x):
        # tag every log line with the classifier name
        logging.info('['+classifier_name+']'+x)

    def distance(a, b):
        # The matrix may store a pair in only one orientation; fall back to
        # the transposed lookup when the direct one is missing.
        # BUGFIX: narrowed the original bare `except:` to the lookup errors
        # actually expected here.
        try:
            return data[id2rowind[a['an']]][id2colind[b['an']]]
        except (KeyError, IndexError):
            return data[id2colind[b['an']]][id2rowind[a['an']]]

    start = time()
    if classifier_name == 'mlknn_basic':
        def get_neighbours(sample, k):
            return find_closest_points_sorted(sample, train_generator_list, [sample], k, distance)
        k = int(k)
        from mlknn import mlknn_basic
        classifier = mlknn_basic.MlknnBasic(train_generator_list, get_neighbours, k, smoothing_param, get_labels_of_record, lambda x: 1, printer)
    elif classifier_name == 'mlknn_threshold':
        def get_neighbours(sample, k):
            return find_closest_points_sorted(sample, train_generator_list, [sample], k, distance)
        k = int(k)
        from mlknn import mlknn_threshold
        classifier = mlknn_threshold.MlknnThreshold(train_generator_list, get_neighbours, k, smoothing_param, get_labels_of_record, lambda x: 1, printer)
    elif classifier_name == 'mlknn_tensembled':
        def get_neighbours(sample, k):
            return find_closest_points_sorted(sample, train_generator_list, [sample], k, distance)
        # the ensembled variant takes a comma-separated list of k values
        k = map(int, k.strip().split(','))
        PRINTER("loaded k-list: "+str(k))
        from mlknn import mlknn_tensembled
        classifier = mlknn_tensembled.MlknnTEnsembled(train_generator_list, get_neighbours, k, get_labels_of_record, lambda x: 1, printer)
    elif classifier_name == 'mlknn-basic-tree':
        # hierarchical variant: neighbours are searched inside the subtree's
        # own training set, so train_gen is threaded through explicitly
        def get_neighbours(sample, k, train_gen):
            return find_closest_points_sorted(sample, train_gen, [sample], k, distance)
        k = int(k)
        from mlknn import mlknn_basic
        mlknn_callable = lambda train_gen, get_labels_of_record_arg: mlknn_basic.MlknnBasic(train_gen, lambda sample, k: get_neighbours(sample, k, train_gen), k, smoothing_param, get_labels_of_record_arg, lambda x: 1, printer)
        # hierarchy levels: 2-char prefix -> 3-char prefix -> full label
        label_mappings = (lambda x: x[:2], lambda x: x[:3], lambda x: x)
        from mltools.ml_hierarchical import MlHierarchical
        classifier = MlHierarchical(train_generator_list, mlknn_callable, label_mappings, get_labels_of_record)
    elif classifier_name == 'mlknn-threshold-tree':
        def get_neighbours(sample, k, train_gen):
            return find_closest_points_sorted(sample, train_gen, [sample], k, distance)
        k = int(k)
        from mlknn import mlknn_threshold
        mlknn_callable = lambda train_gen, get_labels_of_record_arg: mlknn_threshold.MlknnThreshold(train_gen, lambda sample, k: get_neighbours(sample, k, train_gen), k, smoothing_param, get_labels_of_record_arg, lambda x: 1, printer)
        label_mappings = (lambda x: x[:2], lambda x: x[:3], lambda x: x)
        from mltools.ml_hierarchical import MlHierarchical
        classifier = MlHierarchical(train_generator_list, mlknn_callable, label_mappings, get_labels_of_record)
    elif classifier_name == 'mlknn-tensembled-tree':
        def get_neighbours(sample, k, train_gen):
            return find_closest_points_sorted(sample, train_gen, [sample], k, distance)
        k = map(int, k.strip().split(','))
        PRINTER("loaded k-list: "+str(k))
        from mlknn import mlknn_tensembled
        mlknn_callable = lambda train_gen, get_labels_of_record_arg: mlknn_tensembled.MlknnTEnsembled(train_gen, lambda sample, k: get_neighbours(sample, k, train_gen), k, get_labels_of_record_arg, lambda x: 1, printer)
        label_mappings = (lambda x: x[:2], lambda x: x[:3], lambda x: x)
        from mltools.ml_hierarchical import MlHierarchical
        classifier = MlHierarchical(train_generator_list, mlknn_callable, label_mappings, get_labels_of_record)
    # BUGFIX: elapsed time is time()-start; the original printed start-time(),
    # which is always negative.
    PRINTER("Time taken for training:"+str(time()-start))

    PRINTER("------------------------")
    PRINTER("---Testing classifier---")
    PRINTER("------------------------")
    classify_oracle = mc2lmc_tomka_blad
    from mltools.multilabel_evaluate import multilabel_evaluate, multilabel_evaluate_printresults
    # evaluate at three granularities: full label, 3-char and 2-char prefixes
    # (classifier.classify replaces the original __getattribute__('classify'))
    accuracy, precision, recall, hammingloss, subset01loss, fmeasure = multilabel_evaluate(
        lambda: test_generator, classify_oracle, classifier.classify, len(labels),
        [('full label', lambda x: x), ('half label', lambda x: x[:3]), ('low label', lambda x: x[:2])])
    PRINTER("-----------RESULTS-----------")
    multilabel_evaluate_printresults(accuracy, precision, recall, hammingloss, subset01loss, fmeasure, PRINTER)
    return accuracy, precision, recall, hammingloss, subset01loss, fmeasure
# --- script section: pick a distance measure and train a plain MlKnn
# classifier. Chunk of a larger script: train_generator_list, the
# load_*_path variables, distancetype, distancetrainingsteps, k and
# smoothingparam are assigned earlier, outside this excerpt. ---
PRINTER('Loading labels path and elements count...')
lenlabels = len(read_pickle(load_labels_path))
elements_count = read_pickle(load_elements_count_path)
PRINTER("training distance...")
train_generator = lambda: train_generator_list
if distancetype == 'jac':
    from mlknn.jaccard_distance import JaccardDistance
    # train on the training part only: all elements minus the 10% test split
    zbldistance = JaccardDistance(train_generator, elements_count - int(elements_count / 10), distancetrainingsteps)
else:
    # any other value is interpreted as a cosine-distance variant name
    from mlknn.txt_cosine_distance import TxtCosineDistance
    zbldistance = TxtCosineDistance(distancetype)
PRINTER("Finding label list...")
get_labels_of_record = mc2lmc_tomka_blad
find_all_labels = lambda frecords: get_labels_min_occurence(lambda: gen_lmc(frecords), 1)
PRINTER("Training MLKNN...")
from time import time
start = time()
mlknn_single = MlKnn(train_generator, zbldistance, find_closest_points, k, smoothingparam, get_labels_of_record)
# BUGFIX: elapsed time is time()-start; the original printed start-time(),
# which is always negative.
PRINTER("Time taken for training:"+str(time()-start))
#PRINTER("MLKNN: training thresholds on validation set...")
#start = time()
#mlknn_adjust_thresholds(mlknn_single, validate_generator_list, classify_oracle = get_labels_of_record)
#PRINTER("Time taken for training thresholds:"+str(time()-start))
from tools.pickle_tools import save_pickle
PRINTER("MLKNN: pickling the classifier...")