コード例 #1
0
def split_train_test_highest(fname, codeprefixlen, mincodeoccurences, filtered_by):
    """Build train/test record generator factories for frequently used labels.

    Records come from ``gen_record(fname, filtered_by)``, are passed through
    ``gen_record_prefixed`` with ``codeprefixlen``, and are then restricted by
    ``gen_record_filteredbylabels`` to the labels that
    ``get_labels_min_occurence`` returns for ``mincodeoccurences``.

    Args:
        fname: Forwarded to ``gen_record`` as the record source.
        codeprefixlen: Forwarded to ``gen_record_prefixed``.
        mincodeoccurences: Threshold forwarded to ``get_labels_min_occurence``.
        filtered_by: Forwarded to ``gen_record``.

    Returns:
        A 5-tuple ``(train_generator, test_generator, elements_count, labels,
        elements_count)``. Both generators are zero-argument callables that
        start a fresh pass on every call. ``elements_count`` deliberately
        appears twice: the tuple shape is kept for existing callers.
    """
    # Zero-argument factories: every call starts a new pass over the records.
    def rec_generator():
        return gen_record(fname, filtered_by)

    def prefixed_rec_generator():
        return gen_record_prefixed(rec_generator, codeprefixlen)

    def label_code_generator():
        return gen_lmc(prefixed_rec_generator)

    # Generate labels.
    PRINTER('generating labels...')
    labels = get_labels_min_occurence(label_code_generator, mincodeoccurences)
    PRINTER('labels generated:')
    PRINTER(str(labels))

    # Generate filtered records; a set is handed to the filter instead of the
    # raw labels collection.
    labelsset = set(labels)

    def filtered_rec_generator():
        return gen_record_filteredbylabels(prefixed_rec_generator, labelsset)

    PRINTER('counting elements...')
    # Count lazily and only once: the stream is neither materialised in a
    # list nor re-read a second time just to obtain the same number.
    elements_count = sum(1 for _ in filtered_rec_generator())
    PRINTER('number of elements' + str(elements_count))

    # Split into training and testing samples; one tenth of the record count
    # is passed to randomly_divide as its second argument.
    PRINTER('splitting into training and testing...')
    train_inds, test_inds = randomly_divide(elements_count, elements_count // 10)

    def train_generator():
        return gen_record_fromshifts(filtered_rec_generator, train_inds)

    def test_generator():
        return gen_record_fromshifts(filtered_rec_generator, test_inds)

    PRINTER('splitted.')

    return train_generator, test_generator, elements_count, labels, elements_count
コード例 #2
0
 # Generate labels: get_labels_min_occurence receives the code generator
 # factory and the mincodeoccurences threshold. The result is also kept as a
 # set, which is what the filtering step below receives.
 print "generating labels..."
 labels = get_labels_min_occurence(prefix_code_generator, mincodeoccurences)
 labelsset = set(labels)
 print "labels generated."
 print labels
 
 # Generate filtered records: prefix_code_generator is rebound here to a
 # zero-argument factory that runs the prefixed records through
 # gen_record_filteredbylabels with labelsset.
 prefix_code_generator = lambda: gen_record_filteredbylabels(prefixed_rec_generator, labelsset)
 print "counting elements..."
 # The whole filtered stream is materialised in a list just to take its length.
 elements_count = len(list(prefix_code_generator()))
 print "number of elements:", elements_count
 
 # Split into training and testing samples. One tenth of the record count is
 # the second argument of randomly_divide (presumably the test-set size;
 # confirm against randomly_divide).
 print "splitting into training and testing..."
 train_inds, test_inds = randomly_divide(elements_count, int(elements_count/10))
 # Both factories re-read the filtered stream on every call and select
 # records via gen_record_fromshifts with the respective index collection.
 train_generator = lambda: gen_record_fromshifts(prefix_code_generator, train_inds)
 test_generator = lambda: gen_record_fromshifts(prefix_code_generator, test_inds)
 print "splitted."
 
 # Train MLkNN: first the distance object, then a factory for the classifier.
 print "training distance..."
 # The second argument is elements_count minus one tenth of it, presumably
 # the number of training records; confirm against JaccardDistance.
 zbldistance = jaccard_distance.JaccardDistance(train_generator, elements_count-int(elements_count/10), distancetrainingsteps)
 
 print "training hierarchical mlknn..."
 # Factory that builds an MlKnn for a given training generator, reusing the
 # distance object created above together with k and smoothingparam.
 mlknn_callable = lambda train_gen: mlknn.MlKnn(train_gen, zbldistance, find_closest_points.find_closest_points, 
                      k, smoothingparam)
 
 
 # One mapping per level: a 2-item prefix (x[:2]), a 3-item prefix (x[:3])
 # and the unchanged value.
 label_mappings = (lambda x: x[:2], lambda x: x[:3], lambda x: x)
 # Matching record mappings: gen_1record_prefixed with 2, with 3, and identity.
 record_mappings = (lambda x: gen_1record_prefixed(x, 2), lambda x: gen_1record_prefixed(x, 3), lambda x: x)
コード例 #3
0
    # Generate labels: get_labels_min_occurence receives the code generator
    # factory and the mincodeoccurences threshold. The result is also kept as
    # a set, which is what the filtering step below receives.
    print "generating labels..."
    labels = get_labels_min_occurence(prefix_code_generator, mincodeoccurences)
    labelsset = set(labels)
    print "labels generated."
    print labels

    # Generate filtered records: prefix_code_generator is rebound here to a
    # zero-argument factory that runs the prefixed records through
    # gen_record_filteredbylabels with labelsset.
    prefix_code_generator = lambda: gen_record_filteredbylabels(
        prefixed_rec_generator, labelsset)
    print "counting elements..."
    # The whole filtered stream is materialised in a list just to take its
    # length.
    elements_count = len(list(prefix_code_generator()))
    print "number of elements:", elements_count

    # Split into training and testing samples. One tenth of the record count
    # is the second argument of randomly_divide (presumably the test-set size;
    # confirm against randomly_divide).
    print "splitting into training and testing..."
    train_inds, test_inds = randomly_divide(elements_count,
                                            int(elements_count / 10))
    # Both factories re-read the filtered stream on every call and select
    # records via gen_record_fromshifts with the respective index collection.
    train_generator = lambda: gen_record_fromshifts(prefix_code_generator,
                                                    train_inds)
    test_generator = lambda: gen_record_fromshifts(prefix_code_generator,
                                                   test_inds)
    print "splitted."

    # Train MLkNN: the distance object is built first.
    print "training distance..."
    # The second argument is elements_count minus one tenth of it, presumably
    # the number of training records; confirm against JaccardDistance.
    zbldistance = jaccard_distance.JaccardDistance(
        train_generator, elements_count - int(elements_count / 10),
        distancetrainingsteps)

    print "training hierarchical mlknn..."
    mlknn_callable = lambda train_gen: mlknn.MlKnn(
        train_gen, zbldistance, find_closest_points.find_closest_points, k,
コード例 #4
0
 try:
     distancetype = sys.argv[8]
 except:
     print '8th argument expected: type of distance. Available: jac, g0, g1, g2'
     sys.exit(1)
     
 
 # Load the pickled training records from load_train_generator_path; the
 # result is used as a sized collection (len() is taken below).
 PRINTER('Loading training list...')
 from tools.pickle_tools import read_pickle
 all_train_generator_list = read_pickle(load_train_generator_path)
 
 # Split the loaded records into a training part and a validation part.
 PRINTER('Dividing the train_generator_list into training set and validation set...')
 from tools.randomly_divide import randomly_divide
 from data_io.zbl_record_generators import gen_record_fromshifts
 elements_count = len(all_train_generator_list)
 # One fifth of the record count is the second argument of randomly_divide
 # (presumably the validation-set size; confirm against randomly_divide).
 train_inds, validate_inds = randomly_divide(elements_count, int(elements_count / 5))
 # gen_record_fromshifts takes a zero-argument callable, hence the lambdas
 # wrapping the already loaded list; both selections are materialised.
 train_generator_list = list(gen_record_fromshifts(lambda: all_train_generator_list, train_inds))
 validate_generator_list = list(gen_record_fromshifts(lambda: all_train_generator_list, validate_inds))
 
 PRINTER('Loading labels path and elements count...')
 # Number of entries in the pickled labels object.
 lenlabels = len(read_pickle(load_labels_path)) 
 # NOTE: elements_count is rebound here; the list length computed above is
 # replaced by the value stored at load_elements_count_path.
 elements_count = read_pickle(load_elements_count_path) 
 
 PRINTER("training distance...")
 # Zero-argument factory returning the in-memory training records.
 train_generator = lambda: train_generator_list
 if distancetype=='jac':
     from mlknn.jaccard_distance import JaccardDistance
     # NOTE(review): the size passed is the pickled elements_count minus one
     # tenth of it, while train_generator_list comes from the split above whose
     # second argument was one fifth of the list length. Confirm they agree.
     zbldistance = JaccardDistance(train_generator, elements_count-int(elements_count/10), distancetrainingsteps)
 else:
     # Any other distancetype value is handed to TxtCosineDistance unchecked.
     from mlknn.txt_cosine_distance import TxtCosineDistance 
     zbldistance = TxtCosineDistance(distancetype)