from tools.pickle_tools import read_pickle
 # Script fragment: fit a single MlKnnFractional classifier on pickled
 # training records and pickle the fitted classifier.

 # Load the pickled training records the classifier is built from.
 train_generator_list = read_pickle(load_train_generator_path)

 PRINTER('Loading labels path and elements count...')
 # NOTE(review): lenlabels is not used in the visible part of this script.
 lenlabels = len(read_pickle(load_labels_path))
 elements_count = read_pickle(load_elements_count_path)

 PRINTER("training distance...")
 # Zero-argument callable returning the already-loaded record list.
 train_generator = lambda: train_generator_list
 if distancetype=='jac':
     from mlknn.jaccard_distance import JaccardDistance
     # Second argument is elements_count minus a tenth of it; what the value
     # means is defined by JaccardDistance (not visible here).
     zbldistance = JaccardDistance(train_generator, elements_count-int(elements_count/10), distancetrainingsteps)
 else:
     from mlknn.txt_cosine_distance import TxtCosineDistance
     zbldistance = TxtCosineDistance(distancetype)

 PRINTER("Finding label list...")
 get_labels_of_record = mc2lmc_tomka_blad
 # NOTE(review): find_all_labels is defined here but not passed to the
 # classifier below; kept because later code may rely on it.
 find_all_labels = lambda frecords: get_labels_min_occurence(lambda: gen_lmc(frecords), 1)

 PRINTER("Training MLKNN...")
 from time import time
 start = time()
 mlknn_single = MlKnnFractional(train_generator, zbldistance, find_closest_points,
                      k, get_labels_of_record)
 # Elapsed time is "now minus start"; the reversed subtraction reported a
 # negative duration.
 PRINTER("Time taken for training:"+str(time()-start))

 from tools.pickle_tools import save_pickle
 PRINTER("MLKNN: pickling the classifier...")
 save_pickle(mlknn_single, save_classifier_path)
 
 
 # Script fragment: train a hierarchical ML-kNN classifier, pickle it together
 # with the training data, then print oracle labels for the test samples and
 # run the evaluation routine.
 # NOTE(review): train_generator, test_generator, labels, elements_count and
 # the save_* paths are not defined in this excerpt - they must come from
 # earlier code.

 # Train the distance object on the training records.
 print "training distance..."
 zbldistance = jaccard_distance.JaccardDistance(train_generator, elements_count-int(elements_count/10), distancetrainingsteps)

 print "training hierarchical mlknn..."
 # Factory handed to MlHierarchical: builds an MlKnn for whatever training
 # generator it is given, reusing the distance trained above.
 mlknn_callable = lambda train_gen: mlknn.MlKnn(train_gen, zbldistance, find_closest_points.find_closest_points, 
                      k, smoothingparam)


 # Three mappings applied to a label: first 2 items, first 3 items, unchanged.
 label_mappings = (lambda x: x[:2], lambda x: x[:3], lambda x: x)
 # Matching record mappings: gen_1record_prefixed with 2, with 3, and identity.
 record_mappings = (lambda x: gen_1record_prefixed(x, 2), lambda x: gen_1record_prefixed(x, 3), lambda x: x)
 hierarhical_mlknn = ml_hierarchical.MlHierarchical(train_generator, mlknn_callable, label_mappings, record_mappings)

 from tools.pickle_tools import save_pickle
 # The tree content is stored separately under the path suffixed with "mlknn",
 # then the whole classifier object is stored under the plain path.
 save_pickle(hierarhical_mlknn.mltree.content, save_hierarchical_path+"mlknn")
 save_pickle(hierarhical_mlknn, save_hierarchical_path)

 # Materialise the training records into a list before pickling them.
 save_pickle(list(train_generator()), save_train_generator_path)
 save_pickle(len(labels), save_lenlabels_path)


 # The "oracle" simply delegates to mc2lmc_tomka_blad for a record.
 classify_oracle = lambda x: mc2lmc_tomka_blad(x)
 print "----------------------------------------------------"
 print "MLKNN:"
 print "PRINTING TEST SAMPLES:"
 for i in test_generator():
     print classify_oracle(i)

 # The dict maps a display name to a label mapping: unchanged, first 3 items,
 # first 2 items.
 multilabel_evaluate_printresults(test_generator, classify_oracle, hierarhical_mlknn.classify, len(labels), 
                 {'full label': lambda x: x, 'half label': lambda x: x[:3], 'low label': lambda x: x[:2]})
# Example no. 3
# 0
    # Script fragment: fit an MlKnnFractionalEnsembledStrongest classifier and
    # pickle it. train_generator_list, elements_count, distancetype, k_list and
    # the path variables are expected to be defined by earlier code.
    PRINTER("training distance...")
    # Zero-argument callable returning the already-loaded record list.
    train_generator = lambda: train_generator_list
    if distancetype == 'jac':
        from mlknn.jaccard_distance import JaccardDistance
        # Second argument is elements_count minus a tenth of it; its meaning
        # is defined by JaccardDistance (not visible here).
        zbldistance = JaccardDistance(
            train_generator, elements_count - int(elements_count / 10),
            distancetrainingsteps)
    else:
        from mlknn.txt_cosine_distance import TxtCosineDistance
        zbldistance = TxtCosineDistance(distancetype)

    PRINTER("Finding label list...")
    get_labels_of_record = mc2lmc_tomka_blad
    # NOTE(review): find_all_labels is not passed to the classifier below;
    # kept in case later code relies on it.
    find_all_labels = lambda frecords: get_labels_min_occurence(
        lambda: gen_lmc(frecords), 1)

    PRINTER("Training MLKNN...")
    from time import time
    start = time()
    mlknn_single = MlKnnFractionalEnsembledStrongest(train_generator,
                                                     zbldistance,
                                                     find_closest_points,
                                                     k_list,
                                                     get_labels_of_record)
    # Elapsed time is "now minus start"; the reversed subtraction reported a
    # negative duration.
    PRINTER("Time taken for training:" + str(time() - start))

    from tools.pickle_tools import save_pickle
    PRINTER("MLKNN: pickling the classifier...")
    save_pickle(mlknn_single, save_classifier_path)
    # Script fragment: train a hierarchical ML-kNN classifier on the loaded
    # training records and pickle it. read_pickle, train_generator_list,
    # JaccardDistance, MlKnn, MlHierarchical and the path variables are
    # expected to be defined by earlier code.
    elements_count = read_pickle(load_elements_count_path)

    # Zero-argument callable returning the already-loaded record list.
    train_generator = lambda: train_generator_list
    # Train the distance object used by every MlKnn instance below.
    PRINTER("Training Distance...")
    zbldistance = JaccardDistance(train_generator,
                                  elements_count - int(elements_count / 10),
                                  distancetrainingsteps)

    get_labels_of_record = mc2lmc_tomka_blad
    find_all_labels = lambda frecords: get_labels_min_occurence(
        lambda: gen_lmc(frecords), 1)

    # Factory handed to MlHierarchical: builds an MlKnn for whatever training
    # generator it is given, reusing the distance trained above.
    mlknn_callable = lambda train_gen: MlKnn(
        train_gen, zbldistance, find_closest_points, k, smoothingparam,
        find_all_labels, get_labels_of_record)

    # Label mappings: first 2 items, first 3 items, unchanged; the record
    # mappings mirror them through gen_1record_prefixed.
    label_mappings = (lambda x: x[:2], lambda x: x[:3], lambda x: x)
    record_mappings = (lambda x: gen_1record_prefixed(x, 2),
                       lambda x: gen_1record_prefixed(x, 3), lambda x: x)

    PRINTER("Training hierarchical mlknn...")
    from time import time
    start = time()
    hierarhical_mlknn = MlHierarchical(train_generator, mlknn_callable,
                                       label_mappings, record_mappings)
    # Elapsed time is "now minus start"; the reversed subtraction reported a
    # negative duration.
    PRINTER("time taken for training:" + str(time() - start))

    from tools.pickle_tools import save_pickle
    save_pickle(hierarhical_mlknn, save_classifier_path)
    # Script fragment: build a WeightedRandomLabelClassifier from pickled
    # training records and pickle it.
    # Expected command line: <train records> <labels> <elements count>
    # <classifier output path>.
    if len(sys.argv) < 5:
        PRINTER("Not enough of argument!")
        # sys.exit instead of the bare exit(): the latter is injected by the
        # `site` module for interactive use and is not guaranteed to exist.
        sys.exit(1)

    load_train_generator_path = sys.argv[1]
    load_labels_path = sys.argv[2]
    load_elements_count_path = sys.argv[3]
    save_classifier_path = sys.argv[4]

    PRINTER("Input arguments:")
    PRINTER("load_train_generator_path: " + str(load_train_generator_path))
    PRINTER("load_labels_path: " + str(load_labels_path))
    PRINTER("load_elements_count_path: " + str(load_elements_count_path))
    PRINTER("save_classifier_path: " + str(save_classifier_path))

    from tools.pickle_tools import read_pickle
    train_generator_list = read_pickle(load_train_generator_path)
    # NOTE(review): lenlabels and elements_count are loaded but not used in
    # the visible code; the reads are kept so a missing or corrupt input file
    # still fails here as before.
    lenlabels = len(read_pickle(load_labels_path))
    elements_count = read_pickle(load_elements_count_path)

    # Zero-argument callable returning the already-loaded record list.
    train_generator = lambda: train_generator_list
    get_labels_of_record = mc2lmc_tomka_blad
    classify_oracle = lambda x: mc2lmc_tomka_blad(x)

    random_classif = WeightedRandomLabelClassifier(train_generator,
                                                   get_labels_of_record,
                                                   classify_oracle)

    from tools.pickle_tools import save_pickle
    save_pickle(random_classif, save_classifier_path)
if __name__ == '__main__':
    # Build a RandomLabelClassifier from pickled training records and pickle
    # it. Expected command line: <train records> <labels> <elements count>
    # <classifier output path>.
    if len(sys.argv) < 5:
        PRINTER("Not enough of argument!")
        # sys.exit instead of the bare exit(): the latter is injected by the
        # `site` module for interactive use and is not guaranteed to exist.
        sys.exit(1)

    load_train_generator_path = sys.argv[1]
    load_labels_path = sys.argv[2]
    load_elements_count_path = sys.argv[3]
    save_classifier_path = sys.argv[4]

    PRINTER("Input arguments:")
    PRINTER("load_train_generator_path: "+str(load_train_generator_path))
    PRINTER("load_labels_path: "+str(load_labels_path))
    PRINTER("load_elements_count_path: "+str(load_elements_count_path))
    PRINTER("save_classifier_path: "+str(save_classifier_path))

    from tools.pickle_tools import read_pickle
    train_generator_list = read_pickle(load_train_generator_path)
    # NOTE(review): lenlabels and elements_count are loaded but not used in
    # the visible code; the reads are kept so a missing or corrupt input file
    # still fails here as before.
    lenlabels = len(read_pickle(load_labels_path))
    elements_count = read_pickle(load_elements_count_path)

    # Zero-argument callable returning the already-loaded record list.
    train_generator = lambda: train_generator_list
    get_labels_of_record = mc2lmc_tomka_blad
    find_all_labels = lambda frecords: get_labels_min_occurence(lambda: gen_lmc(frecords), 1)
    classify_oracle = lambda x: mc2lmc_tomka_blad(x)

    random_classif = RandomLabelClassifier(train_generator, get_labels_of_record, find_all_labels, classify_oracle)

    from tools.pickle_tools import save_pickle
    save_pickle(random_classif, save_classifier_path)
    
 # Script fragment: load pickled inputs, train a hierarchical ML-kNN
 # classifier and pickle it. The load_*/save_* paths, JaccardDistance, MlKnn,
 # MlHierarchical, k and smoothingparam are expected to be defined earlier.
 PRINTER("-------------------------------------------")

 PRINTER("Loading the input data.")
 from tools.pickle_tools import read_pickle
 train_generator_list = read_pickle(load_train_generator_path)
 # NOTE(review): lenlabels is not used in the visible part of this script.
 lenlabels = len(read_pickle(load_labels_path))
 elements_count = read_pickle(load_elements_count_path)

 # Zero-argument callable returning the already-loaded record list.
 train_generator = lambda: train_generator_list
 # Train the distance object used by every MlKnn instance below.
 PRINTER("Training Distance...")
 zbldistance = JaccardDistance(train_generator, elements_count-int(elements_count/10), distancetrainingsteps)

 get_labels_of_record = mc2lmc_tomka_blad
 find_all_labels = lambda frecords: get_labels_min_occurence(lambda: gen_lmc(frecords), 1)

 # Factory handed to MlHierarchical: builds an MlKnn for whatever training
 # generator it is given, reusing the distance trained above.
 mlknn_callable = lambda train_gen: MlKnn(train_gen, zbldistance, find_closest_points,
                      k, smoothingparam, find_all_labels, get_labels_of_record)

 # Label mappings: first 2 items, first 3 items, unchanged; the record
 # mappings mirror them through gen_1record_prefixed.
 label_mappings = (lambda x: x[:2], lambda x: x[:3], lambda x: x)
 record_mappings = (lambda x: gen_1record_prefixed(x, 2), lambda x: gen_1record_prefixed(x, 3), lambda x: x)

 PRINTER("Training hierarchical mlknn...")
 from time import time
 start = time()
 hierarhical_mlknn = MlHierarchical(train_generator, mlknn_callable, label_mappings, record_mappings)
 # Elapsed time is "now minus start"; the reversed subtraction reported a
 # negative duration.
 PRINTER("time taken for training:"+str(time()-start))

 from tools.pickle_tools import save_pickle
 save_pickle(hierarhical_mlknn, save_classifier_path)
 
# Example no. 8
# 0
    # Script fragment: train a hierarchical ML-kNN classifier, pickle it
    # together with the training data, then print oracle labels for the test
    # samples.
    # NOTE(review): zbldistance, train_generator, test_generator, labels and
    # the save_* paths are not defined in this excerpt - they must come from
    # earlier code.
    print "training hierarchical mlknn..."
    # Factory handed to MlHierarchical: builds an MlKnn for whatever training
    # generator it is given, reusing the already-trained distance.
    mlknn_callable = lambda train_gen: mlknn.MlKnn(
        train_gen, zbldistance, find_closest_points.find_closest_points, k,
        smoothingparam)

    # Three mappings applied to a label: first 2 items, first 3 items,
    # unchanged; the record mappings mirror them via gen_1record_prefixed.
    label_mappings = (lambda x: x[:2], lambda x: x[:3], lambda x: x)
    record_mappings = (lambda x: gen_1record_prefixed(x, 2),
                       lambda x: gen_1record_prefixed(x, 3), lambda x: x)
    hierarhical_mlknn = ml_hierarchical.MlHierarchical(train_generator,
                                                       mlknn_callable,
                                                       label_mappings,
                                                       record_mappings)

    from tools.pickle_tools import save_pickle
    # The tree content is stored separately under the path suffixed with
    # "mlknn", then the whole classifier object under the plain path.
    save_pickle(hierarhical_mlknn.mltree.content,
                save_hierarchical_path + "mlknn")
    save_pickle(hierarhical_mlknn, save_hierarchical_path)

    # Materialise the training records into a list before pickling them.
    save_pickle(list(train_generator()), save_train_generator_path)
    save_pickle(len(labels), save_lenlabels_path)

    # The "oracle" simply delegates to mc2lmc_tomka_blad for a record.
    classify_oracle = lambda x: mc2lmc_tomka_blad(x)
    print "----------------------------------------------------"
    print "MLKNN:"
    print "PRINTING TEST SAMPLES:"
    for i in test_generator():
        print classify_oracle(i)

    multilabel_evaluate_printresults(
        test_generator, classify_oracle, hierarhical_mlknn.classify,
        len(labels), {
# Example no. 9
# 0
    # Script fragment: read the remaining command-line arguments, split the
    # input into train/test sets and pickle the results. fname, codeprefixlen,
    # mincodeoccurences and the other save_* paths are expected to be set by
    # earlier code.

    # 7th argument is mandatory: report its purpose and abort when absent.
    # Only IndexError is caught so unrelated errors are not swallowed.
    try:
        save_elements_count_path = sys.argv[7]
    except IndexError:
        print('7th argument: path where elements count is to be stored.')
        sys.exit(1)
    # 8th and later arguments: field names which have to occur for the record
    # to be considered. Slicing never raises, so the former try/except around
    # it was dead code; filtered_by is simply empty when none are given.
    filtered_by = sys.argv[8:]

    # Disabled argument logging, kept as an inert string literal.
    """
    PRINTERMAIN("Input arguments:")
    PRINTERMAIN("fname: "+fname)
    PRINTERMAIN("codeprefixlen: "+str(codeprefixlen))
    PRINTERMAIN("mincodeoccurences: "+str(mincodeoccurences))
    PRINTERMAIN("save_train_generator_path: "+save_train_generator_path)
    PRINTERMAIN("save_test_generator_path: "+save_test_generator_path)
    PRINTERMAIN("save_labels_path: "+save_labels_path)
    PRINTERMAIN("save_elements_count_path: "+save_elements_count_path)
    PRINTERMAIN("filtered_by: "+str(filtered_by))
    """

    # NOTE(review): elements_count appears twice in the unpacking target, so
    # the 3rd returned value is overwritten by the 5th. One of the two names
    # is probably wrong - confirm against split_train_test_highest's return
    # value. Behaviour is left unchanged here.
    train_generator, test_generator, elements_count, labels, elements_count = split_train_test_highest(fname, codeprefixlen, mincodeoccurences, filtered_by)

    from tools.pickle_tools import save_pickle
    # Generators are materialised into lists before pickling.
    save_pickle(list(train_generator()), save_train_generator_path)
    save_pickle(list(test_generator()), save_test_generator_path)
    save_pickle(labels, save_labels_path)
    save_pickle(elements_count, save_elements_count_path)