Beispiel #1
0
        k = map(int, k.strip().split(','))
        PRINTER("loaded k-list: " + str(k))
        from mlknn import mlknn_tensembled
        mlknn_callable = lambda train_gen, get_labels_of_record_arg: mlknn_tensembled.MlknnTEnsembled(
            train_gen, lambda sample, k: get_neighbours(sample, k, train_gen),
            k, get_labels_of_record_arg, lambda x: 1, printer)

    label_mappings = (lambda x: x[:2], lambda x: x[:3], lambda x: x)

    from mltools.ml_hierarchical import MlHierarchical
    classifier = MlHierarchical(train_generator_list, mlknn_callable,
                                label_mappings, get_labels_of_record)

    PRINTER("Time taken for training:" + str(start - time()))

    PRINTER("------------------------")
    PRINTER("---Testing classifier---")
    PRINTER("------------------------")
    test_generator = read_pickle(load_test_generator)
    labels = read_pickle(load_labels_path)

    classify_oracle = mc2lmc_tomka_blad
    from mltools.multilabel_evaluate import multilabel_evaluate_printresults
    PRINTER("-----------RESULTS-----------")
    multilabel_evaluate_printresults(lambda: test_generator, classify_oracle,
                                     classifier.__getattribute__('classify'),
                                     len(labels),
                                     [('full label', lambda x: x),
                                      ('half label', lambda x: x[:3]),
                                      ('low label', lambda x: x[:2])], labels)
Beispiel #2
0
def main(train_generator_list, labels, elements_count, classifier_name, k, smoothing_param, distancematrix, test_generator):
    """Train one ML-kNN classifier variant on a distance matrix and evaluate it.

    The distance matrix is read with ``fread_smatrix(distancematrix)``; records
    are looked up in it through their ``'an'`` field.

    Parameters:
        train_generator_list: training records, handed to the classifier and
            to ``find_closest_points_sorted``.
        labels: only ``len(labels)`` is used (passed to ``multilabel_evaluate``).
        elements_count: not used by this function; kept so existing callers
            keep working.
        classifier_name: one of ``'mlknn_basic'``, ``'mlknn_threshold'``,
            ``'mlknn_tensembled'``, ``'mlknn-basic-tree'``,
            ``'mlknn-threshold-tree'``, ``'mlknn-tensembled-tree'``.
            The ``-tree`` names wrap the variant in ``MlHierarchical``.
        k: for the ``tensembled`` variants a comma-separated string of
            integers (e.g. ``"3,5,8"``); otherwise anything ``int()`` accepts.
        smoothing_param: forwarded to the basic and threshold variants
            (the tensembled variant does not take it).
        distancematrix: argument forwarded to ``fread_smatrix``.
        test_generator: test records, handed to ``multilabel_evaluate``.

    Returns:
        The tuple ``(accuracy, precision, recall, hammingloss, subset01loss,
        fmeasure)`` exactly as returned by ``multilabel_evaluate``.

    Raises:
        ValueError: if ``classifier_name`` is not one of the names above.
    """
    import logging
    import sys
    from time import time

    # classifier_name -> (ML-kNN variant, wrap it in MlHierarchical?)
    variants = {
        'mlknn_basic': ('basic', False),
        'mlknn_threshold': ('threshold', False),
        'mlknn_tensembled': ('tensembled', False),
        'mlknn-basic-tree': ('basic', True),
        'mlknn-threshold-tree': ('threshold', True),
        'mlknn-tensembled-tree': ('tensembled', True),
    }
    # Fail fast: previously an unknown name skipped every branch and only
    # surfaced later as an unbound-local error on `classifier`.
    if classifier_name not in variants:
        raise ValueError("Unknown classifier_name: %r (expected one of: %s)"
                         % (classifier_name, ', '.join(sorted(variants))))
    variant, hierarchical = variants[classifier_name]

    PRINTER("Finding label list...")
    get_labels_of_record = mc2lmc_tomka_blad

    PRINTER("Loading distance matrix...")
    # Make the sibling package importable, without growing sys.path on every call.
    if r'../' not in sys.path:
        sys.path.append(r'../')
    from data_io.matrix_io import fread_smatrix
    (rows, cols, data) = fread_smatrix(distancematrix)
    # Record id -> position in the matrix rows / columns.
    id2rowind = dict((rec_id, ind) for ind, rec_id in enumerate(rows))
    id2colind = dict((rec_id, ind) for ind, rec_id in enumerate(cols))

    PRINTER("Training classifier...")

    def printer(x):
        # Prefix every classifier message with the classifier name.
        logging.info('['+classifier_name+']'+x)

    def distance(a, b):
        # Look the pair up as (row=a, column=b); if that is not addressable,
        # fall back to the transposed lookup.
        a_id, b_id = a['an'], b['an']
        try:
            return data[id2rowind[a_id]][id2colind[b_id]]
        except (KeyError, IndexError):
            return data[id2colind[b_id]][id2rowind[a_id]]

    def neighbours_in(train_gen):
        # Build the two-argument neighbour finder the classifiers expect,
        # bound to the given training set.
        def get_neighbours(sample, k):
            return find_closest_points_sorted(sample, train_gen, [sample], k, distance)
        return get_neighbours

    start = time()

    if variant == 'tensembled':
        # A real list on both Python 2 and 3, so the log line shows the values.
        k = [int(part) for part in k.strip().split(',')]
        PRINTER("loaded k-list: "+str(k))
    else:
        k = int(k)

    # `build(train_gen, labels_of_record)` creates one classifier of the
    # selected variant; the variant module is imported only when needed.
    if variant == 'basic':
        from mlknn import mlknn_basic

        def build(train_gen, labels_of_record):
            return mlknn_basic.MlknnBasic(train_gen, neighbours_in(train_gen), k, smoothing_param,
                                          labels_of_record, lambda x: 1, printer)
    elif variant == 'threshold':
        from mlknn import mlknn_threshold

        def build(train_gen, labels_of_record):
            return mlknn_threshold.MlknnThreshold(train_gen, neighbours_in(train_gen), k, smoothing_param,
                                                  labels_of_record, lambda x: 1, printer)
    else:
        from mlknn import mlknn_tensembled

        def build(train_gen, labels_of_record):
            return mlknn_tensembled.MlknnTEnsembled(train_gen, neighbours_in(train_gen), k,
                                                    labels_of_record, lambda x: 1, printer)

    if hierarchical:
        # Label prefixes of length 2, 3 and the full label, in that order.
        label_mappings = (lambda x: x[:2], lambda x: x[:3], lambda x: x)
        from mltools.ml_hierarchical import MlHierarchical
        classifier = MlHierarchical(train_generator_list, build, label_mappings, get_labels_of_record)
    else:
        classifier = build(train_generator_list, get_labels_of_record)

    # Elapsed time is now - start (the original subtraction was reversed and
    # always reported a negative duration).
    PRINTER("Time taken for training:"+str(time()-start))
    PRINTER("------------------------")
    PRINTER("---Testing classifier---")
    PRINTER("------------------------")

    classify_oracle = mc2lmc_tomka_blad
    from mltools.multilabel_evaluate import multilabel_evaluate, multilabel_evaluate_printresults
    accuracy, precision, recall, hammingloss, subset01loss, fmeasure = multilabel_evaluate(
        lambda: test_generator, classify_oracle, classifier.classify, len(labels),
        [('full label', lambda x: x), ('half label', lambda x: x[:3]), ('low label', lambda x: x[:2])])
    PRINTER("-----------RESULTS-----------")
    multilabel_evaluate_printresults(accuracy, precision, recall, hammingloss, subset01loss, fmeasure, PRINTER)
    return accuracy, precision, recall, hammingloss, subset01loss, fmeasure
 PRINTERMAIN("save_elements_count_path: "+save_elements_count_path)
 PRINTERMAIN("filtered_by: "+str(filtered_by))
 """
 from main_train_classifier_distmat import main
 #curr_accuracy, curr_precision, curr_recall, curr_hammingloss, curr_subset01loss, curr_fmeasure
 measures = [[] for _ in xrange(6)]#6 measures
 for train_generator, test_generator, elements_count, labels, elements_count in gen_train_test_kfold(fname, codeprefixlen, mincodeoccurences, filtered_by, kfold):
     from choose_best_k import evaluate_k_kfold
     train_elements_count = len(train_generator)
     k_evaluation = evaluate_k_kfold(labels, labelsset, lambda: train_generator, train_elements_count, classifier_name, k, smoothing_param, distancematrix, kfold)
     #sys.exit(1)
     sub_measures = main(train_generator, labels, elements_count, classifier_name, k, smoothing_param, distancematrix, test_generator)
     for i, sub_m in enumerate(sub_measures):
         measures[i].append(sub_m)
 #summarize, each :
 final_measures = [{} for _ in xrange(6)]
 for ind, final_measure in enumerate(final_measures):
     for key in measures[0][0].keys():
         final_measure[key] = 0
     for key in measures[0][0].keys():
         for measure in measures[ind]:
             final_measure[key] += measure[key]
     for key in measures[0][0].keys():
         final_measure[key] /= len(measures[0])
 
 from mltools.multilabel_evaluate import multilabel_evaluate_printresults
 PRINTERMAIN("---FINAL RESULTS---")
 def PRINTER_PARAM(x):
     print x
 multilabel_evaluate_printresults(*final_measures)
     kfold = int(sys.argv[8])
 except:
     print '8th argument: how many folds.'
     sys.exit(1)
 try:
     filtered_by = sys.argv[9:]
 except:
     print '8th argument: field names which have to occur for the record to be considered.'
     sys.exit(1)
 """
 PRINTERMAIN("Input arguments:")
 PRINTERMAIN("fname: "+fname)
 PRINTERMAIN("codeprefixlen: "+str(codeprefixlen))
 PRINTERMAIN("mincodeoccurences: "+str(mincodeoccurences))
 PRINTERMAIN("save_train_generator_path: "+save_train_generator_path)
 PRINTERMAIN("save_test_generator_path: "+save_test_generator_path)
 PRINTERMAIN("save_labels_path: "+save_labels_path)
 PRINTERMAIN("save_elements_count_path: "+save_elements_count_path)
 PRINTERMAIN("filtered_by: "+str(filtered_by))
 """
 labels, labelsset, prefix_code_generator, elements_count = load_labels_codegen_elemcnt(fname, codeprefixlen, mincodeoccurences, filtered_by)
 from main_train_classifier_distmat import main
 #curr_accuracy, curr_precision, curr_recall, curr_hammingloss, curr_subset01loss, curr_fmeasure
 final_measures = evaluate_k_fold(classifier_name, k, smoothing_param, distancematrix, kfold, labels, labelsset, prefix_code_generator, elements_count)
 
 from mltools.multilabel_evaluate import multilabel_evaluate_printresults
 PRINTERMAIN("---FINAL RESULTS---")
 def PRINTER_PARAM(x):
     print x
 multilabel_evaluate_printresults(*(final_measures+[PRINTER_PARAM]))
    PRINTER("load_hierarchical_path: "+str(load_hierarchical_path))
    PRINTER("load_train_generator: "+str(load_train_generator))
    PRINTER("lenlabels_path: "+str(lenlabels_path))

    log_level = logging.INFO
    logging.basicConfig(level=log_level)
    
    from tools.pickle_tools import read_pickle
    hierarhical_mlknn = read_pickle(load_hierarchical_path)
    test_generator = read_pickle(load_train_generator) 
    lenlabels = read_pickle(lenlabels_path) 
    
    #print "Finding out if the ML-hierarchical has internal data..."
    #check_internal_data(hierarhical_mlknn)
    
    
    print "----------------------------------------------------"
    #print "MLKNN:"
    #print "PRINTING TEST SAMPLES:"
    #for i in test_generator:
    #    print classify_oracle(i)
    
    classify_oracle = lambda x: mc2lmc_tomka_blad(x)
    multilabel_evaluate_printresults(lambda: test_generator, classify_oracle, hierarhical_mlknn.classify, lenlabels, 
                    {'full label': lambda x: x, 'half label': lambda x: x[:3], 'low label': lambda x: x[:2]})
    
    #print "----------------------------------------------------"
    #print "STUPID KNN:"
    #multilabel_evaluate_printresults(test_generator, classify_oracle, hierarhical_mlknn.classify_stupid, len(labels), 
    #                #{'full label': lambda x: x, 'short label': lambda x: x[:1]})
    #                {'full label': lambda x: x, 'half label': lambda x: x[:3], 'low label': lambda x: x[:2]})
                                                                           k, smoothing_param, get_labels_of_record_arg, lambda x:1, printer)
    
    elif classifier_name == 'mlknn-tensembled-tree':
        k = map(int, k.strip().split(','))
        PRINTER("loaded k-list: "+str(k))
        from mlknn import mlknn_tensembled
        mlknn_callable = lambda train_gen, get_labels_of_record_arg: mlknn_tensembled.MlknnTEnsembled(train_gen, lambda sample, k: get_neighbours(sample, k, train_gen), 
                                                                                                      k, get_labels_of_record_arg, lambda x:1, printer)

    label_mappings = (lambda x: x[:2], lambda x: x[:3], lambda x: x)
    
    
    from mltools.ml_hierarchical import MlHierarchical
    classifier = MlHierarchical(train_generator_list, mlknn_callable, label_mappings, get_labels_of_record)
        
    PRINTER("Time taken for training:"+str(start-time()))
    
    PRINTER("------------------------")
    PRINTER("---Testing classifier---")
    PRINTER("------------------------")
    test_generator = read_pickle(load_test_generator) 
    labels = read_pickle(load_labels_path)


    classify_oracle = mc2lmc_tomka_blad
    from mltools.multilabel_evaluate import multilabel_evaluate_printresults
    PRINTER("-----------RESULTS-----------")
    multilabel_evaluate_printresults(lambda: test_generator, classify_oracle, classifier.__getattribute__('classify'), len(labels), 
                    [('full label', lambda x: x), ('half label', lambda x: x[:3]), ('low label', lambda x: x[:2])], labels)
    
    
Beispiel #7
0
        train_elements_count = len(train_generator)
        k_evaluation = evaluate_k_kfold(labels, labelsset,
                                        lambda: train_generator,
                                        train_elements_count, classifier_name,
                                        k, smoothing_param, distancematrix,
                                        kfold)
        #sys.exit(1)
        sub_measures = main(train_generator, labels, elements_count,
                            classifier_name, k, smoothing_param,
                            distancematrix, test_generator)
        for i, sub_m in enumerate(sub_measures):
            measures[i].append(sub_m)
    #summarize, each :
    final_measures = [{} for _ in xrange(6)]
    for ind, final_measure in enumerate(final_measures):
        for key in measures[0][0].keys():
            final_measure[key] = 0
        for key in measures[0][0].keys():
            for measure in measures[ind]:
                final_measure[key] += measure[key]
        for key in measures[0][0].keys():
            final_measure[key] /= len(measures[0])

    from mltools.multilabel_evaluate import multilabel_evaluate_printresults
    PRINTERMAIN("---FINAL RESULTS---")

    def PRINTER_PARAM(x):
        print x

    multilabel_evaluate_printresults(*final_measures)
Beispiel #8
0
        filtered_by = sys.argv[9:]
    except:
        print '8th argument: field names which have to occur for the record to be considered.'
        sys.exit(1)
    """
    PRINTERMAIN("Input arguments:")
    PRINTERMAIN("fname: "+fname)
    PRINTERMAIN("codeprefixlen: "+str(codeprefixlen))
    PRINTERMAIN("mincodeoccurences: "+str(mincodeoccurences))
    PRINTERMAIN("save_train_generator_path: "+save_train_generator_path)
    PRINTERMAIN("save_test_generator_path: "+save_test_generator_path)
    PRINTERMAIN("save_labels_path: "+save_labels_path)
    PRINTERMAIN("save_elements_count_path: "+save_elements_count_path)
    PRINTERMAIN("filtered_by: "+str(filtered_by))
    """
    labels, labelsset, prefix_code_generator, elements_count = load_labels_codegen_elemcnt(
        fname, codeprefixlen, mincodeoccurences, filtered_by)
    from main_train_classifier_distmat import main
    #curr_accuracy, curr_precision, curr_recall, curr_hammingloss, curr_subset01loss, curr_fmeasure
    final_measures = evaluate_k_fold(classifier_name, k, smoothing_param,
                                     distancematrix, kfold, labels, labelsset,
                                     prefix_code_generator, elements_count)

    from mltools.multilabel_evaluate import multilabel_evaluate_printresults
    PRINTERMAIN("---FINAL RESULTS---")

    def PRINTER_PARAM(x):
        print x

    multilabel_evaluate_printresults(*(final_measures + [PRINTER_PARAM]))
     labels_path = sys.argv[3]
 except:
     print '3d argument expected: path to a pickled labels list.'
     sys.exit(1)
 try:
     classify_method_name = sys.argv[4]
 except:
     print '4th argument expected: classify method name.'
     sys.exit(1)
 
 #PRINTER("Input arguments:")
 #PRINTER("load_classifier_path: "+str(load_classifier_path))
 #PRINTER("load_test_generator: "+str(load_test_generator))
 #PRINTER("labels_path: "+str(labels_path))
 #PRINTER("classify_method_name: "+str(classify_method_name))
 
 from tools.pickle_tools import read_pickle
 classifier = read_pickle(load_classifier_path)
 test_generator = read_pickle(load_test_generator) 
 labels = read_pickle(labels_path)
 
 #print "Finding out if the ML-hierarchical has internal data..."
 #check_internal_data(hierarhical_mlknn)
 classify_oracle = mc2lmc_tomka_blad
 
 #print "----------------------------------------------------"
 #print "Hierachical MLKNN:"
 PRINTER("-----------RESULTS-----------")
 multilabel_evaluate_printresults(lambda: test_generator, classify_oracle, classifier.__getattribute__(classify_method_name), len(labels), 
                 {'full label': lambda x: x, 'half label': lambda x: x[:3], 'low label': lambda x: x[:2]}, labels)
 
Beispiel #10
0
    log_level = logging.INFO
    logging.basicConfig(level=log_level)

    from tools.pickle_tools import read_pickle
    hierarhical_mlknn = read_pickle(load_hierarchical_path)
    test_generator = read_pickle(load_train_generator)
    lenlabels = read_pickle(lenlabels_path)

    #print "Finding out if the ML-hierarchical has internal data..."
    #check_internal_data(hierarhical_mlknn)

    print "----------------------------------------------------"
    #print "MLKNN:"
    #print "PRINTING TEST SAMPLES:"
    #for i in test_generator:
    #    print classify_oracle(i)

    classify_oracle = lambda x: mc2lmc_tomka_blad(x)
    multilabel_evaluate_printresults(
        lambda: test_generator, classify_oracle, hierarhical_mlknn.classify,
        lenlabels, {
            'full label': lambda x: x,
            'half label': lambda x: x[:3],
            'low label': lambda x: x[:2]
        })

    #print "----------------------------------------------------"
    #print "STUPID KNN:"
    #multilabel_evaluate_printresults(test_generator, classify_oracle, hierarhical_mlknn.classify_stupid, len(labels),
    #                #{'full label': lambda x: x, 'short label': lambda x: x[:1]})
    #                {'full label': lambda x: x, 'half label': lambda x: x[:3], 'low label': lambda x: x[:2]})
    label_mappings = (lambda x: x[:2], lambda x: x[:3], lambda x: x)

    PRINTER("Training hierarchical mlknn...")
    from time import time
    start = time()
    hierarhical_mlknn = MlHierarchical(train_generator, mlknn_callable,
                                       label_mappings, get_labels_of_record)
    PRINTER("time taken for training:" + str(start - time()))

    PRINTER("Testing hierarchical mlknn fractional...")
    test_generator = read_pickle(load_test_generator)
    labels = read_pickle(load_labels_path)

    #print "Finding out if the ML-hierarchical has internal data..."
    #check_internal_data(hierarhical_mlknn)
    classify_oracle = mc2lmc_tomka_blad

    #print "----------------------------------------------------"
    #print "Hierachical MLKNN:"
    from mltools.multilabel_evaluate import multilabel_evaluate_printresults
    PRINTER("-----------RESULTS-----------")
    multilabel_evaluate_printresults(
        lambda: test_generator, classify_oracle,
        hierarhical_mlknn.__getattribute__('classify'), len(labels), {
            'full label': lambda x: x,
            'half label': lambda x: x[:3],
            'low label': lambda x: x[:2]
        }, labels)

    #from tools.pickle_tools import save_pickle
    #save_pickle(hierarhical_mlknn, save_classifier_path)
 
 get_labels_of_record = mc2lmc_tomka_blad
 mlknn_callable = lambda train_gen, get_labels_of_record_arg: MlKnn(train_gen, zbldistance, find_closest_points, 
                      k, smoothingparam, get_labels_of_record_arg)
 
 label_mappings = (lambda x: x[:2], lambda x: x[:3], lambda x: x)
 
 PRINTER("Training hierarchical mlknn...")
 from time import time
 start = time()
 hierarhical_mlknn = MlHierarchical(train_generator, mlknn_callable, label_mappings, get_labels_of_record)
 PRINTER("time taken for training:"+str(start-time()))
 
 PRINTER("Testing hierarchical mlknn...")
 test_generator = read_pickle(load_test_generator) 
 labels = read_pickle(load_labels_path)
 
 #print "Finding out if the ML-hierarchical has internal data..."
 #check_internal_data(hierarhical_mlknn)
 classify_oracle = mc2lmc_tomka_blad
 
 #print "----------------------------------------------------"
 #print "Hierachical MLKNN:"
 from mltools.multilabel_evaluate import multilabel_evaluate_printresults
 PRINTER("-----------RESULTS-----------")
 multilabel_evaluate_printresults(lambda: test_generator, classify_oracle, hierarhical_mlknn.__getattribute__('classify'), len(labels), 
                 {'full label': lambda x: x, 'half label': lambda x: x[:3], 'low label': lambda x: x[:2]}, labels)
 
 #from tools.pickle_tools import save_pickle
 #save_pickle(hierarhical_mlknn, save_classifier_path)