def __cpp_sim_matrix_l_generation_routine__(sim_matrix_path, mscmodel, msc2ix,
                                            secondary_codes_weight,
                                            docid2seccodes):
    """Return the aggregated similarity matrix, building its file if needed.

    The aggregated matrix is stored next to ``sim_matrix_path`` under a name
    derived from ``MIN_COUNT_MSCPRIM`` and ``similarity_aggregation_method_l``.
    When that file does not exist yet, it is produced by
    ``cpp_wrapper.aggregate_simmatrix`` from the list returned by
    ``get_msc2wids_list``. The file is then read back, its row labels are
    passed together with ``msc2ix`` to ``____validate_cpp_output____``, and
    the matrix data is returned.
    """
    # The output name encodes the settings, so different settings do not
    # share one cached file.
    aggregated_path = "".join((sim_matrix_path,
                               ".msc",
                               str(MIN_COUNT_MSCPRIM),
                               "_",
                               similarity_aggregation_method_l))

    already_built = aux.exists(aggregated_path)
    if not already_built:
        groups = get_msc2wids_list(msc2ix, mscmodel,
                                   secondary_codes_weight,
                                   docid2seccodes)
        cpp_wrapper.aggregate_simmatrix(
            sim_matrix_path,
            aggregated_path,
            groups,
            method=similarity_aggregation_method_l)

    logging.info("[build_msc_tree] Loading simmatrix from: " +
                 str(aggregated_path))
    # Column labels are read but not needed here.
    row_labels, _col_labels, sim_matrix_l = matrix_io.fread_smatrix(
        aggregated_path)
    ____validate_cpp_output____(msc2ix, row_labels)
    return sim_matrix_l
Example #2
0
    train_generator_list = read_pickle(load_train_generator_path)

    PRINTER('Loading labels path and elements count...')
    lenlabels = len(read_pickle(load_labels_path))
    elements_count = read_pickle(load_elements_count_path)

    PRINTER("Finding label list...")
    get_labels_of_record = mc2lmc_tomka_blad
    find_all_labels = lambda frecords: get_labels_min_occurence(
        lambda: gen_lmc(frecords), 1)

    PRINTER("Loading distance matrix...")
    import sys
    sys.path.append(r'../')
    from data_io.matrix_io import fread_smatrix
    (rows, cols, data) = fread_smatrix(distancematrix)
    id2rowind, id2colind = {}, {}
    for ind, id in enumerate(rows):
        id2rowind[id] = ind
    for ind, id in enumerate(cols):
        id2colind[id] = ind

    print "len(train_generator_list):", len(train_generator_list)
    print "len(rows):", len(rows)
    #print "(rows, cols, data):", (rows, cols, data)

    PRINTER("Training classifier...")
    from time import time

    def printer(x):
        #import logging
Example #3
0
def main(train_generator_list, labels, elements_count, classifier_name, k, smoothing_param, distancematrix, test_generator):
    """Train the selected ML-kNN classifier variant and evaluate it.

    Parameters:
      train_generator_list -- training records; passed to the classifier
          constructor and searched by find_closest_points_sorted.
      labels -- only len(labels) is used (forwarded to multilabel_evaluate).
      elements_count -- not used by this function; kept so that existing
          callers keep working.
      classifier_name -- one of 'mlknn_basic', 'mlknn_threshold',
          'mlknn_tensembled', 'mlknn-basic-tree', 'mlknn-threshold-tree',
          'mlknn-tensembled-tree'.
      k -- anything int() accepts for the basic/threshold variants; a
          comma-separated string of integers for the tensembled variants.
      smoothing_param -- forwarded to the basic/threshold constructors
          (the tensembled constructor does not receive it).
      distancematrix -- path handed to fread_smatrix.
      test_generator -- records used for evaluation.

    Returns the tuple (accuracy, precision, recall, hammingloss,
    subset01loss, fmeasure) produced by multilabel_evaluate.

    Raises ValueError when classifier_name is not one of the names above.
    """
    PRINTER("Finding label list...")
    get_labels_of_record = mc2lmc_tomka_blad

    PRINTER("Loading distance matrix...")
    import sys
    # The project root has to be importable for data_io; avoid piling up
    # duplicate entries when main() is called repeatedly.
    if r'../' not in sys.path:
        sys.path.append(r'../')
    from data_io.matrix_io import fread_smatrix
    (rows, cols, data) = fread_smatrix(distancematrix)
    # Map record ids to their positions along each matrix axis.
    id2rowind = dict((rec_id, ind) for ind, rec_id in enumerate(rows))
    id2colind = dict((rec_id, ind) for ind, rec_id in enumerate(cols))

    PRINTER("Training classifier...")
    from time import time

    def printer(x):
        logging.info('['+classifier_name+']'+x)

    def distance(a, b):
        # Records are identified by their 'an' field. If the (row, col)
        # lookup fails, retry with the indices swapped.
        try:
            return data[id2rowind[a['an']]][id2colind[b['an']]]
        except (KeyError, IndexError):
            return data[id2colind[b['an']]][id2rowind[a['an']]]

    def neighbours_among(train_gen):
        # Build a (sample, k) -> neighbours function bound to one training set.
        def get_neighbours(sample, k):
            return find_closest_points_sorted(sample, train_gen, [sample], k, distance)
        return get_neighbours

    start = time()
    # Each branch defines build(train_gen, get_labels_of_record_arg), which
    # constructs one classifier of the requested family on a training set.
    if classifier_name in ('mlknn_basic', 'mlknn-basic-tree'):
        k = int(k)
        from mlknn import mlknn_basic

        def build(train_gen, get_labels_of_record_arg):
            return mlknn_basic.MlknnBasic(train_gen, neighbours_among(train_gen), k, smoothing_param,
                                          get_labels_of_record_arg, lambda x: 1, printer)

    elif classifier_name in ('mlknn_threshold', 'mlknn-threshold-tree'):
        k = int(k)
        from mlknn import mlknn_threshold

        def build(train_gen, get_labels_of_record_arg):
            return mlknn_threshold.MlknnThreshold(train_gen, neighbours_among(train_gen), k, smoothing_param,
                                                  get_labels_of_record_arg, lambda x: 1, printer)

    elif classifier_name in ('mlknn_tensembled', 'mlknn-tensembled-tree'):
        # A real list (not a lazy map object) so the log line below shows
        # the parsed values.
        k = [int(part) for part in k.strip().split(',')]
        PRINTER("loaded k-list: "+str(k))
        from mlknn import mlknn_tensembled

        def build(train_gen, get_labels_of_record_arg):
            return mlknn_tensembled.MlknnTEnsembled(train_gen, neighbours_among(train_gen), k,
                                                    get_labels_of_record_arg, lambda x: 1, printer)

    else:
        # Fail before training instead of hitting a NameError on
        # 'classifier' during evaluation.
        raise ValueError("Unknown classifier name: " + repr(classifier_name))

    if classifier_name.endswith('-tree'):
        # Hierarchical variants: one classifier per label-prefix level
        # (first 2 characters, first 3 characters, full label).
        label_mappings = (lambda x: x[:2], lambda x: x[:3], lambda x: x)
        from mltools.ml_hierarchical import MlHierarchical
        classifier = MlHierarchical(train_generator_list, build, label_mappings, get_labels_of_record)
    else:
        classifier = build(train_generator_list, get_labels_of_record)

    # Elapsed time is "now minus start"; the reverse order gives a negative value.
    PRINTER("Time taken for training:"+str(time()-start))
    PRINTER("------------------------")
    PRINTER("---Testing classifier---")
    PRINTER("------------------------")

    classify_oracle = mc2lmc_tomka_blad
    from mltools.multilabel_evaluate import multilabel_evaluate, multilabel_evaluate_printresults
    accuracy, precision, recall, hammingloss, subset01loss, fmeasure = multilabel_evaluate(
        lambda: test_generator, classify_oracle, classifier.classify, len(labels),
        [('full label', lambda x: x), ('half label', lambda x: x[:3]), ('low label', lambda x: x[:2])])
    PRINTER("-----------RESULTS-----------")
    multilabel_evaluate_printresults(accuracy, precision, recall, hammingloss, subset01loss, fmeasure, PRINTER)
    return accuracy, precision, recall, hammingloss, subset01loss, fmeasure
    k2avgsil = silhouettes(simmatrix, possible_k, clustering_method)
    return max((v, k) for k, v in k2avgsil.iteritems())[1]


if __name__ == "__main__":

    import doctest
    doctest.testmod()

    logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s',
                        level=logging.INFO)

    try:
        simmatrix_path = sys.argv[1]
    except:
        print "Argument expected: similarity matrix path"
        sys.exit(-1)
    print simmatrix_path

    from data_io import matrix_io
    (rows, cols, simmatrix) = matrix_io.fread_smatrix(
        simmatrix_path)  #, datareader=matrix_io.__read_ftabs__, maxrows=1000
    print "matrix size=", len(simmatrix), "x", len(simmatrix[0])
    print simmatrix[0][:10]
    print simmatrix[1][:10]
    print simmatrix[2][:10]
    print simmatrix[3][:10]
    print simmatrix[4][:10]
    print "Selected k=", number_of_clusters(simmatrix,
                                            xrange(2, len(simmatrix)))
from __future__ import division
from itertools import izip

import sys
sys.path.append(r'../')
from data_io.matrix_io import fread_smatrix

# Path of the matrix file to dump, taken from the first command-line argument.
filename = sys.argv[1]

# Load the file; fread_smatrix returns a (rows, cols, data) triple.
(rows, cols, data) = fread_smatrix(filename)

# Print everything that was read, for manual inspection.
print "(rows, cols, data):", (rows, cols, data)