Example #1
0
def split_train_test_highest(fname, codeprefixlen, mincodeoccurences, filtered_by):
    """Split the records in *fname* into train/test generators (~90/10).

    Records are reduced to code prefixes of length *codeprefixlen* and only
    those whose label occurs at least *mincodeoccurences* times are kept.
    *filtered_by* is forwarded to gen_record (presumably the fields a record
    must contain -- TODO confirm against gen_record).

    Returns (train_generator, test_generator, elements_count, labels,
    elements_count).  elements_count is deliberately returned twice to keep
    the historical 5-tuple interface callers already unpack.
    """
    # prepare generator factories -- each call restarts iteration from scratch
    rec_generator = lambda: gen_record(fname, filtered_by)
    prefixed_rec_generator = lambda: gen_record_prefixed(rec_generator, codeprefixlen)
    prefix_code_generator = lambda: gen_lmc(prefixed_rec_generator)
    # generate labels
    PRINTER('generating labels...')
    labels = get_labels_min_occurence(prefix_code_generator, mincodeoccurences)
    PRINTER('labels generated:')
    PRINTER(str(labels))

    # gen filtered records: keep only records carrying one of the kept labels
    labelsset = set(labels)
    prefix_code_generator = lambda: gen_record_filteredbylabels(prefixed_rec_generator, labelsset)
    PRINTER('counting elements...')
    elements_count = len(list(prefix_code_generator()))
    PRINTER('number of elements' + str(elements_count))

    # split into training and testing samples (~1/10 of elements go to test)
    PRINTER('splitting into training and testing...')
    train_inds, test_inds = randomly_divide(elements_count, int(elements_count / 10))
    train_generator = lambda: gen_record_fromshifts(prefix_code_generator, train_inds)
    test_generator = lambda: gen_record_fromshifts(prefix_code_generator, test_inds)
    PRINTER('splitted.')

    # NOTE(fix): the original exhausted prefix_code_generator() a second time
    # here just to recount -- the count cannot have changed, so reuse it.
    return train_generator, test_generator, elements_count, labels, elements_count
def count_label_statistics(fname, fields):
    """
    Counts the following statistics and prints them. 
    D is the dataset filtered by the condition to contain all of the fields
    L is number of distinct labels in D.
    
    -Label cardinality:  the average number of labels of the examples in D
    -Label density: the average number of labels of the examples in D divided by |L|
    -Bolo11: the percentage of documents that contain at least 2 labels of the same 2 code prefix 
    -Bolo12: the percentage of documents that contain at least 2 labels of the same 3 code prefix 
    -Bolo21: the percentage of documents that contain at least 2 labels of different 2 code prefix 
    -Bolo22: the percentage of documents that contain at least 2 labels of different 3 code prefix 

    """

    all = 0
    labels = set()
    lc = 0
    ld = 0
    bolo11 = 0
    bolo12 = 0
    bolo21 = 0
    bolo22 = 0
    #count statistics
    for lmc in gen_lmc(lambda: gen_record(fname, fields), fields):
        all += 1
        for mc in lmc:
            labels.add(mc)
        lc += len(lmc)
        ld += len(lmc)
        if contains2of_same_prefix(lmc, 2):
            bolo11 += 1
        if contains2of_same_prefix(lmc, 3):
            bolo12 += 1
        if contains2of_diff_prefix(lmc, 2):
            bolo21 += 1
        if contains2of_diff_prefix(lmc, 3):
            bolo22 += 1
        print '[count_label_statistics]: lmc:', lmc, contains2of_same_prefix(
            lmc, 2), contains2of_same_prefix(lmc, 3), contains2of_diff_prefix(
                lmc, 2), contains2of_diff_prefix(lmc, 3)

    #print statistics
    print "lc:", lc / all
    print "ld:", ld / (all * len(labels))

    print "bolo11 contain at least 2 of same 2 code prefix:", bolo11 / all
    print "bolo12 contain at least 2 of same 3 code prefix:", bolo12 / all
    print "bolo21 contain at least 2 of diff 2 code prefix:", bolo21 / all
    print "bolo22 contain at least 2 of diff 3 code prefix:", bolo22 / all
def count_label_statistics(fname, fields):
    """
    Counts the following statistics and prints them. 
    D is the dataset filtered by the condition to contain all of the fields
    L is number of distinct labels in D.
    
    -Label cardinality:  the average number of labels of the examples in D
    -Label density: the average number of labels of the examples in D divided by |L|
    -Bolo11: the percentage of documents that contain at least 2 labels of the same 2 code prefix 
    -Bolo12: the percentage of documents that contain at least 2 labels of the same 3 code prefix 
    -Bolo21: the percentage of documents that contain at least 2 labels of different 2 code prefix 
    -Bolo22: the percentage of documents that contain at least 2 labels of different 3 code prefix 

    """
    
    
    all = 0
    labels = set()
    lc = 0
    ld = 0
    bolo11 = 0
    bolo12 = 0
    bolo21 = 0
    bolo22 = 0
    #count statistics
    for lmc in gen_lmc(lambda: gen_record(fname, fields), fields):
        all+=1
        for mc in lmc:
            labels.add(mc)
        lc += len(lmc)
        ld += len(lmc)
        if contains2of_same_prefix(lmc, 2):
            bolo11 += 1
        if contains2of_same_prefix(lmc, 3):
            bolo12 += 1
        if contains2of_diff_prefix(lmc, 2):
            bolo21 += 1
        if contains2of_diff_prefix(lmc, 3):
            bolo22 += 1
        print '[count_label_statistics]: lmc:', lmc, contains2of_same_prefix(lmc, 2), contains2of_same_prefix(lmc, 3), contains2of_diff_prefix(lmc, 2), contains2of_diff_prefix(lmc, 3)
        
    #print statistics
    print "lc:", lc/all
    print "ld:", ld/(all*len(labels))
    
    print "bolo11 contain at least 2 of same 2 code prefix:", bolo11/all
    print "bolo12 contain at least 2 of same 3 code prefix:", bolo12/all
    print "bolo21 contain at least 2 of diff 2 code prefix:", bolo21/all
    print "bolo22 contain at least 2 of diff 3 code prefix:", bolo22/all
def load_labels_codegen_elemcnt(fname, codeprefixlen, mincodeoccurences, filtered_by):
    """Build the label set and a filtered-record generator for *fname*.

    Labels are code prefixes of length *codeprefixlen* occurring at least
    *mincodeoccurences* times.  Returns (labels, labelsset,
    prefix_code_generator, elements_count).
    """
    # generator factories: calling one restarts iteration over the file
    def records():
        return gen_record(fname, filtered_by)

    def prefixed_records():
        return gen_record_prefixed(records, codeprefixlen)

    # generate labels
    PRINTER('generating labels...')
    labels = get_labels_min_occurence(lambda: gen_lmc(prefixed_records), mincodeoccurences)
    PRINTER('labels generated:')
    PRINTER(str(labels))

    # records filtered down to those carrying a kept label
    labelsset = set(labels)
    prefix_code_generator = lambda: gen_record_filteredbylabels(prefixed_records, labelsset)
    PRINTER('counting elements...')
    elements_count = sum(1 for _ in prefix_code_generator())
    PRINTER('number of elements' + str(elements_count))

    return labels, labelsset, prefix_code_generator, elements_count
Example #5
0
def load_labels_codegen_elemcnt(fname, codeprefixlen, mincodeoccurences,
                                filtered_by):
    """Derive frequent labels and a label-filtered record generator.

    Duplicate (reformatted) definition of the earlier function of the same
    name; behavior is identical.  Returns (labels, labelsset,
    filtered_gen, elements_count).
    """
    # factories so every call re-opens the underlying record stream
    rec_gen = lambda: gen_record(fname, filtered_by)

    def prefixed_gen():
        return gen_record_prefixed(rec_gen, codeprefixlen)

    code_gen = lambda: gen_lmc(prefixed_gen)

    PRINTER('generating labels...')
    labels = get_labels_min_occurence(code_gen, mincodeoccurences)
    PRINTER('labels generated:')
    PRINTER(str(labels))

    # keep only records whose labels survived the occurrence threshold
    labelsset = set(labels)
    filtered_gen = lambda: gen_record_filteredbylabels(prefixed_gen, labelsset)
    PRINTER('counting elements...')
    elements_count = len(list(filtered_gen()))
    PRINTER('number of elements' + str(elements_count))

    return labels, labelsset, filtered_gen, elements_count
    print "k:", k
    print "smoothingparam:", smoothingparam
    print "distancetrainingsteps:", distancetrainingsteps
    print "filtered_by:", filtered_by
    print "save_hierarchical_path:", save_hierarchical_path
    print "save_train_generator_path:", save_train_generator_path
    print "save_lenlabels_path:", save_lenlabels_path
    

    log_level = logging.INFO
    logging.basicConfig(level=log_level)
    
    #prepare generators
    rec_generator = lambda: gen_record(fname, filtered_by)
    prefixed_rec_generator = lambda: gen_record_prefixed(rec_generator, codeprefixlen)
    prefix_code_generator = lambda: gen_lmc(prefixed_rec_generator)
    
    #generate labels
    print "generating labels..."
    labels = get_labels_min_occurence(prefix_code_generator, mincodeoccurences)
    labelsset = set(labels)
    print "labels generated."
    print labels
    
    #gen filtered records:
    prefix_code_generator = lambda: gen_record_filteredbylabels(prefixed_rec_generator, labelsset)
    print "counting elements..."
    elements_count = len(list(prefix_code_generator()))
    print "number of elements:", elements_count
    
    #split into training and testing samples
 from tools.pickle_tools import read_pickle
 train_generator_list = read_pickle(load_train_generator_path)
 
 PRINTER('Loading labels path and elements count...')
 lenlabels = len(read_pickle(load_labels_path)) 
 elements_count = read_pickle(load_elements_count_path) 
 
 PRINTER("training distance...")
 train_generator = lambda: train_generator_list
 if distancetype=='jac':
     from mlknn.jaccard_distance import JaccardDistance
     zbldistance = JaccardDistance(train_generator, elements_count-int(elements_count/10), distancetrainingsteps)
 else:
     from mlknn.txt_cosine_distance import TxtCosineDistance 
     zbldistance = TxtCosineDistance(distancetype)
 
 PRINTER("Finding label list...")
 get_labels_of_record = mc2lmc_tomka_blad
 find_all_labels = lambda frecords: get_labels_min_occurence(lambda: gen_lmc(frecords), 1)
 
 PRINTER("Training MLKNN...")
 from time import time
 start = time()
 mlknn_single = MlKnnFractional(train_generator, zbldistance, find_closest_points, 
                      k, get_labels_of_record)
 PRINTER("Time taken for training:"+str(start-time()))
 
 from tools.pickle_tools import save_pickle
 PRINTER("MLKNN: pickling the classifier...")
 save_pickle(mlknn_single, save_classifier_path)
 
Example #8
0
 try:
     filtered_by = sys.argv[7:]
 except:
     print '7th argument: list of the fields to exist in considered records.'
     sys.exit(1)
 
 #prepare generators
 rec_generator_first = lambda: gen_record(fname, filtered_by)
 #choosing shuffling_cnt elements in random:
 PRINTER("shuffling in random")
 import random
 chosen_records = random.sample(list(rec_generator_first()), shuffling_cnt)
 rec_generator = lambda: chosen_records
 
 prefixed_rec_generator = lambda: gen_record_prefixed(rec_generator, codeprefixlen)
 prefix_code_generator = lambda: gen_lmc(prefixed_rec_generator)
 
 #generate labels
 PRINTER("generating labels...")
 labels_counts = get_labels_counts(prefix_code_generator, mincodeoccurences)
 #PRINTER("labels generated."
 #PRINTER(sorted(labels_counts, key = lambda x: x[1], reverse = True)
 biggest_labels = map(lambda x: x[0], sorted(labels_counts, key = lambda x: x[1], 
                                             reverse = True))[:biggest_labels_cnt]
 labelsset = set(biggest_labels)
 PRINTER(biggest_labels)
 
 #gen filtered records:
 prefix_code_generator = lambda: gen_record_filteredbylabels(prefixed_rec_generator, labelsset)
 PRINTER("counting elements...")
 elements_count = len(list(prefix_code_generator()))
Example #9
0
    except:
        print '8th argument expected: load_test_generator parameter'
        sys.exit(1)

    PRINTER('Loading training list...')
    from tools.pickle_tools import read_pickle
    train_generator_list = read_pickle(load_train_generator_path)

    PRINTER('Loading labels path and elements count...')
    lenlabels = len(read_pickle(load_labels_path))
    elements_count = read_pickle(load_elements_count_path)

    PRINTER("Finding label list...")
    get_labels_of_record = mc2lmc_tomka_blad
    find_all_labels = lambda frecords: get_labels_min_occurence(
        lambda: gen_lmc(frecords), 1)

    PRINTER("Loading distance matrix...")
    import sys
    sys.path.append(r'../')
    from data_io.matrix_io import fread_smatrix
    (rows, cols, data) = fread_smatrix(distancematrix)
    id2rowind, id2colind = {}, {}
    for ind, id in enumerate(rows):
        id2rowind[id] = ind
    for ind, id in enumerate(cols):
        id2colind[id] = ind

    print "len(train_generator_list):", len(train_generator_list)
    print "len(rows):", len(rows)
    #print "(rows, cols, data):", (rows, cols, data)
Example #10
0
def main(train_generator_list, labels, elements_count, classifier_name, k, smoothing_param, distancematrix, test_generator):
    """Train the classifier selected by *classifier_name* using a
    precomputed pairwise distance matrix, then evaluate it on
    *test_generator*.

    *k* is an int-like string for the single-k classifiers, or a
    comma-separated list for the ensembled variants.  Returns
    (accuracy, precision, recall, hammingloss, subset01loss, fmeasure).
    Raises ValueError on an unrecognised *classifier_name* (the original
    fell through and crashed later with NameError on 'classifier').
    """
    PRINTER("Finding label list...")
    get_labels_of_record = mc2lmc_tomka_blad
    find_all_labels = lambda frecords: get_labels_min_occurence(lambda: gen_lmc(frecords), 1)

    PRINTER("Loading distance matrix...")
    import sys
    sys.path.append(r'../')
    from data_io.matrix_io import fread_smatrix
    (rows, cols, data) = fread_smatrix(distancematrix)
    # map record ids (the 'an' field) to row/column indices of the matrix
    id2rowind, id2colind = {}, {}
    for ind, id in enumerate(rows):
        id2rowind[id] = ind
    for ind, id in enumerate(cols):
        id2colind[id] = ind

    PRINTER("Training classifier...")
    from time import time

    def printer(x):
        # per-classifier prefixed logging
        logging.info('[' + classifier_name + ']' + x)

    def distance(a, b):
        # the matrix may only be populated one way round; fall back to the
        # flipped lookup.  NOTE(fix): narrowed the original bare except.
        try:
            return data[id2rowind[a['an']]][id2colind[b['an']]]
        except (KeyError, IndexError):
            return data[id2colind[b['an']]][id2rowind[a['an']]]

    # neighbour search over the fixed training list (flat classifiers) ...
    def get_neighbours(sample, k):
        return find_closest_points_sorted(sample, train_generator_list, [sample], k, distance)

    # ... and over a per-node training set (hierarchical classifiers).
    # NOTE(fix): hoisted -- the original redefined these in every branch.
    def get_neighbours_tree(sample, k, train_gen):
        return find_closest_points_sorted(sample, train_gen, [sample], k, distance)

    # label coarsening levels used by the hierarchical ('tree') variants
    label_mappings = (lambda x: x[:2], lambda x: x[:3], lambda x: x)

    start = time()
    if classifier_name == 'mlknn_basic':
        k = int(k)
        from mlknn import mlknn_basic
        classifier = mlknn_basic.MlknnBasic(train_generator_list, get_neighbours, k, smoothing_param, get_labels_of_record, lambda x: 1, printer)

    elif classifier_name == 'mlknn_threshold':
        k = int(k)
        from mlknn import mlknn_threshold
        classifier = mlknn_threshold.MlknnThreshold(train_generator_list, get_neighbours, k, smoothing_param, get_labels_of_record, lambda x: 1, printer)

    elif classifier_name == 'mlknn_tensembled':
        k = map(int, k.strip().split(','))
        PRINTER("loaded k-list: " + str(k))
        from mlknn import mlknn_tensembled
        classifier = mlknn_tensembled.MlknnTEnsembled(train_generator_list, get_neighbours, k, get_labels_of_record, lambda x: 1, printer)

    elif classifier_name == 'mlknn-basic-tree':
        k = int(k)
        from mlknn import mlknn_basic
        mlknn_callable = lambda train_gen, get_labels_of_record_arg: mlknn_basic.MlknnBasic(
            train_gen, lambda sample, k: get_neighbours_tree(sample, k, train_gen),
            k, smoothing_param, get_labels_of_record_arg, lambda x: 1, printer)
        from mltools.ml_hierarchical import MlHierarchical
        classifier = MlHierarchical(train_generator_list, mlknn_callable, label_mappings, get_labels_of_record)

    elif classifier_name == 'mlknn-threshold-tree':
        k = int(k)
        from mlknn import mlknn_threshold
        mlknn_callable = lambda train_gen, get_labels_of_record_arg: mlknn_threshold.MlknnThreshold(
            train_gen, lambda sample, k: get_neighbours_tree(sample, k, train_gen),
            k, smoothing_param, get_labels_of_record_arg, lambda x: 1, printer)
        from mltools.ml_hierarchical import MlHierarchical
        classifier = MlHierarchical(train_generator_list, mlknn_callable, label_mappings, get_labels_of_record)

    elif classifier_name == 'mlknn-tensembled-tree':
        k = map(int, k.strip().split(','))
        PRINTER("loaded k-list: " + str(k))
        from mlknn import mlknn_tensembled
        mlknn_callable = lambda train_gen, get_labels_of_record_arg: mlknn_tensembled.MlknnTEnsembled(
            train_gen, lambda sample, k: get_neighbours_tree(sample, k, train_gen),
            k, get_labels_of_record_arg, lambda x: 1, printer)
        from mltools.ml_hierarchical import MlHierarchical
        classifier = MlHierarchical(train_generator_list, mlknn_callable, label_mappings, get_labels_of_record)

    else:
        # NOTE(fix): fail fast instead of NameError at evaluation time
        raise ValueError('unknown classifier_name: ' + str(classifier_name))

    # NOTE(fix): the original printed start-time(), a negative duration
    PRINTER("Time taken for training:" + str(time() - start))
    PRINTER("------------------------")
    PRINTER("---Testing classifier---")
    PRINTER("------------------------")

    classify_oracle = mc2lmc_tomka_blad
    from mltools.multilabel_evaluate import multilabel_evaluate, multilabel_evaluate_printresults
    accuracy, precision, recall, hammingloss, subset01loss, fmeasure = multilabel_evaluate(
        lambda: test_generator, classify_oracle, classifier.classify, len(labels),
        [('full label', lambda x: x), ('half label', lambda x: x[:3]), ('low label', lambda x: x[:2])])
    PRINTER("-----------RESULTS-----------")
    multilabel_evaluate_printresults(accuracy, precision, recall, hammingloss, subset01loss, fmeasure, PRINTER)
    return accuracy, precision, recall, hammingloss, subset01loss, fmeasure
Example #11
0
 PRINTER('Loading labels path and elements count...')
 lenlabels = len(read_pickle(load_labels_path)) 
 elements_count = read_pickle(load_elements_count_path) 
 
 PRINTER("training distance...")
 train_generator = lambda: train_generator_list
 if distancetype=='jac':
     from mlknn.jaccard_distance import JaccardDistance
     zbldistance = JaccardDistance(train_generator, elements_count-int(elements_count/10), distancetrainingsteps)
 else:
     from mlknn.txt_cosine_distance import TxtCosineDistance 
     zbldistance = TxtCosineDistance(distancetype)
     
 PRINTER("Finding label list...")
 get_labels_of_record = mc2lmc_tomka_blad
 find_all_labels = lambda frecords: get_labels_min_occurence(lambda: gen_lmc(frecords), 1)
 
 PRINTER("Training MLKNN...")
 from time import time
 start = time()
 mlknn_single = MlKnn(train_generator, zbldistance, find_closest_points, 
                      k, smoothingparam, get_labels_of_record)
 PRINTER("Time taken for training:"+str(start-time()))
 
 #PRINTER("MLKNN: training thresholds on validation set...")
 #start = time()
 #mlknn_adjust_thresholds(mlknn_single, validate_generator_list, classify_oracle = get_labels_of_record)
 #PRINTER("Time taken for training thresholds:"+str(start-time()))
 
 from tools.pickle_tools import save_pickle
 PRINTER("MLKNN: pickling the classifier...")