def __cpp_sim_matrix_l_generation_routine__(sim_matrix_path, mscmodel, msc2ix, secondary_codes_weight, docid2seccodes):
    """Return the aggregated similarity matrix, building its file if needed.

    The aggregated matrix lives next to ``sim_matrix_path`` in a file whose
    name is ``sim_matrix_path`` followed by ``".msc"``, the module-level
    ``MIN_COUNT_MSCPRIM``, ``"_"`` and the module-level
    ``similarity_aggregation_method_l``.  When ``aux.exists`` reports that
    this file is missing, it is produced with
    ``cpp_wrapper.aggregate_simmatrix``; the file is then read back with
    ``matrix_io.fread_smatrix`` in either case.

    Args:
        sim_matrix_path: Path of the source similarity matrix.
        mscmodel: Forwarded to ``get_msc2wids_list``.
        msc2ix: Forwarded to ``get_msc2wids_list`` and to
            ``____validate_cpp_output____``.
        secondary_codes_weight: Forwarded to ``get_msc2wids_list``.
        docid2seccodes: Forwarded to ``get_msc2wids_list``.

    Returns:
        The third element of the triple returned by
        ``matrix_io.fread_smatrix`` for the aggregated file.
    """
    aggregated_path = "".join((
        sim_matrix_path,
        ".msc",
        str(MIN_COUNT_MSCPRIM),
        "_",
        similarity_aggregation_method_l,
    ))

    already_built = aux.exists(aggregated_path)
    if not already_built:
        # The grouping is only needed when the file has to be (re)generated.
        grouping = get_msc2wids_list(
            msc2ix, mscmodel, secondary_codes_weight, docid2seccodes)
        cpp_wrapper.aggregate_simmatrix(
            sim_matrix_path,
            aggregated_path,
            grouping,
            method=similarity_aggregation_method_l)

    logging.info("[build_msc_tree] Loading simmatrix from: " + str(aggregated_path))
    row_ids, _unused_col_ids, aggregated = matrix_io.fread_smatrix(aggregated_path)

    # Check the row ids read from the file against msc2ix before handing
    # the matrix back.
    ____validate_cpp_output____(msc2ix, row_ids)
    return aggregated
# Script fragment: loads the training inputs and the distance matrix, then
# starts setting up classifier training.
# NOTE(review): read_pickle, PRINTER and the load_*_path / distancematrix
# names are defined outside the visible source.
train_generator_list = read_pickle(load_train_generator_path)
PRINTER('Loading labels path and elements count...')
# Only the number of stored labels is kept, not the labels themselves.
lenlabels = len(read_pickle(load_labels_path))
elements_count = read_pickle(load_elements_count_path)
PRINTER("Finding label list...")
get_labels_of_record = mc2lmc_tomka_blad
# Deferred helper: nothing is evaluated until find_all_labels is called.
find_all_labels = lambda frecords: get_labels_min_occurence( lambda: gen_lmc(frecords), 1)
PRINTER("Loading distance matrix...")
import sys
# Extends the module search path with the parent directory (relative to the
# current working directory) before importing data_io.
sys.path.append(r'../')
from data_io.matrix_io import fread_smatrix
(rows, cols, data) = fread_smatrix(distancematrix)
# Map every row / column identifier to its position in rows / cols.
# NOTE(review): the loop variable `id` shadows the builtin of the same name.
id2rowind, id2colind = {}, {}
for ind, id in enumerate(rows):
    id2rowind[id] = ind
for ind, id in enumerate(cols):
    id2colind[id] = ind
print "len(train_generator_list):", len(train_generator_list)
print "len(rows):", len(rows)
#print "(rows, cols, data):", (rows, cols, data)
PRINTER("Training classifier...")
from time import time
# NOTE(review): the body of printer continues beyond the visible source.
def printer(x):
    #import logging
def main(train_generator_list, labels, elements_count, classifier_name, k, smoothing_param, distancematrix, test_generator):
    """Train the selected ML-kNN classifier and evaluate it on the test data.

    Args:
        train_generator_list: Training records.  They are handed to the
            classifier and searched for nearest neighbours; ``distance``
            reads each record's ``'an'`` entry.
        labels: Collection of labels; only ``len(labels)`` is used (passed
            to ``multilabel_evaluate``).
        elements_count: Not used by this function; kept for interface
            compatibility.
        classifier_name: One of ``'mlknn_basic'``, ``'mlknn_threshold'``,
            ``'mlknn_tensembled'``, ``'mlknn-basic-tree'``,
            ``'mlknn-threshold-tree'`` or ``'mlknn-tensembled-tree'``.
        k: Number of neighbours.  Converted with ``int()`` for the basic and
            threshold variants; for the tensembled variants it must be a
            comma-separated string of integers.
        smoothing_param: Forwarded to the basic and threshold classifiers.
        distancematrix: Argument passed to ``fread_smatrix`` to load the
            distance matrix.
        test_generator: Test records; evaluation receives them through a
            zero-argument callable.

    Returns:
        The tuple ``(accuracy, precision, recall, hammingloss, subset01loss,
        fmeasure)`` as produced by ``multilabel_evaluate``.

    Raises:
        ValueError: If ``classifier_name`` is not one of the supported names.
    """
    flat_names = ('mlknn_basic', 'mlknn_threshold', 'mlknn_tensembled')
    tree_names = ('mlknn-basic-tree', 'mlknn-threshold-tree', 'mlknn-tensembled-tree')
    # Fail fast: previously an unknown name went through matrix loading and
    # then died with a NameError on the unbound `classifier`.
    if classifier_name not in flat_names + tree_names:
        raise ValueError(
            "Unknown classifier_name: %r (expected one of: %s)"
            % (classifier_name, ', '.join(flat_names + tree_names)))

    PRINTER("Finding label list...")
    get_labels_of_record = mc2lmc_tomka_blad

    PRINTER("Loading distance matrix...")
    import sys
    sys.path.append(r'../')
    from data_io.matrix_io import fread_smatrix
    (rows, cols, data) = fread_smatrix(distancematrix)
    # Identifier -> position lookups for the matrix rows and columns.
    id2rowind = dict((record_id, ind) for ind, record_id in enumerate(rows))
    id2colind = dict((record_id, ind) for ind, record_id in enumerate(cols))

    PRINTER("Training classifier...")
    from time import time

    def printer(x):
        logging.info('[' + classifier_name + ']' + x)

    def distance(a, b):
        """Look up the matrix entry for records a and b."""
        try:
            return data[id2rowind[a['an']]][id2colind[b['an']]]
        except (KeyError, IndexError):
            # The pair is not available in (row, column) orientation; retry
            # with the two lookups swapped.
            return data[id2colind[b['an']]][id2rowind[a['an']]]

    def neighbours_within(train_gen):
        """Return a (sample, k) neighbour finder restricted to train_gen."""
        def get_neighbours(sample, k):
            return find_closest_points_sorted(sample, train_gen, [sample], k, distance)
        return get_neighbours

    start = time()
    # Each branch parses k, imports only the module it needs, and defines a
    # factory that builds the classifier for a given training set.
    if classifier_name in ('mlknn_basic', 'mlknn-basic-tree'):
        k = int(k)
        from mlknn import mlknn_basic

        def mlknn_callable(train_gen, get_labels_of_record_arg):
            return mlknn_basic.MlknnBasic(
                train_gen, neighbours_within(train_gen), k, smoothing_param,
                get_labels_of_record_arg, lambda x: 1, printer)
    elif classifier_name in ('mlknn_threshold', 'mlknn-threshold-tree'):
        k = int(k)
        from mlknn import mlknn_threshold

        def mlknn_callable(train_gen, get_labels_of_record_arg):
            return mlknn_threshold.MlknnThreshold(
                train_gen, neighbours_within(train_gen), k, smoothing_param,
                get_labels_of_record_arg, lambda x: 1, printer)
    else:
        # Tensembled variants take a list of k values; a list comprehension
        # keeps this a real list regardless of how map() behaves.
        k = [int(item) for item in k.strip().split(',')]
        PRINTER("loaded k-list: " + str(k))
        from mlknn import mlknn_tensembled

        def mlknn_callable(train_gen, get_labels_of_record_arg):
            return mlknn_tensembled.MlknnTEnsembled(
                train_gen, neighbours_within(train_gen), k,
                get_labels_of_record_arg, lambda x: 1, printer)

    if classifier_name in tree_names:
        # Label prefixes of length 2, 3 and the full label, in that order.
        label_mappings = (lambda x: x[:2], lambda x: x[:3], lambda x: x)
        from mltools.ml_hierarchical import MlHierarchical
        classifier = MlHierarchical(train_generator_list, mlknn_callable, label_mappings, get_labels_of_record)
    else:
        classifier = mlknn_callable(train_generator_list, get_labels_of_record)

    # Elapsed time is "now minus start"; the reversed subtraction used
    # before always reported a negative duration.
    PRINTER("Time taken for training:" + str(time() - start))

    PRINTER("------------------------")
    PRINTER("---Testing classifier---")
    PRINTER("------------------------")
    classify_oracle = mc2lmc_tomka_blad
    from mltools.multilabel_evaluate import multilabel_evaluate, multilabel_evaluate_printresults
    accuracy, precision, recall, hammingloss, subset01loss, fmeasure = multilabel_evaluate(
        lambda: test_generator,
        classify_oracle,
        classifier.classify,
        len(labels),
        [('full label', lambda x: x), ('half label', lambda x: x[:3]), ('low label', lambda x: x[:2])])

    PRINTER("-----------RESULTS-----------")
    multilabel_evaluate_printresults(accuracy, precision, recall, hammingloss, subset01loss, fmeasure, PRINTER)
    return accuracy, precision, recall, hammingloss, subset01loss, fmeasure
# NOTE(review): the next two statements are the tail of a function whose
# header lies before the visible source (presumably number_of_clusters,
# which the script below calls - confirm).
    k2avgsil = silhouettes(simmatrix, possible_k, clustering_method)
    # Select the k with the largest value in k2avgsil; because (v, k) tuples
    # are compared, equal values are resolved in favour of the larger k.
    return max((v, k) for k, v in k2avgsil.iteritems())[1]


# Script entry point: runs the module doctests, then loads the similarity
# matrix named on the command line and prints the selected cluster count.
if __name__ == "__main__":
    import doctest
    doctest.testmod()

    logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)

    # The matrix path is the first command-line argument; exit with -1 when
    # it is missing.
    try:
        simmatrix_path = sys.argv[1]
    except:
        print "Argument expected: similarity matrix path"
        sys.exit(-1)
    print simmatrix_path

    from data_io import matrix_io
    (rows, cols, simmatrix) = matrix_io.fread_smatrix( simmatrix_path) #, datareader=matrix_io.__read_ftabs__, maxrows=1000
    # Print the matrix dimensions and the first ten entries of rows 0-4.
    # NOTE(review): this indexes rows 0..4 unconditionally, so it assumes
    # the matrix has at least five rows.
    print "matrix size=", len(simmatrix), "x", len(simmatrix[0])
    print simmatrix[0][:10]
    print simmatrix[1][:10]
    print simmatrix[2][:10]
    print simmatrix[3][:10]
    print simmatrix[4][:10]

    # Candidate cluster counts run from 2 up to (but excluding) the number
    # of matrix rows.
    print "Selected k=", number_of_clusters(simmatrix, xrange(2, len(simmatrix)))
# Script: reads the matrix file named by the first command-line argument
# with fread_smatrix and prints the resulting (rows, cols, data) triple.
from __future__ import division
# NOTE(review): izip is not used in the visible portion of this script.
from itertools import izip
import sys
# Extends the module search path with the parent directory (relative to the
# current working directory) before importing data_io.
sys.path.append(r'../')
from data_io.matrix_io import fread_smatrix
# NOTE(review): raises IndexError when no argument is supplied.
filename = sys.argv[1]
(rows, cols, data) = fread_smatrix(filename)
print "(rows, cols, data):", (rows, cols, data)