def split_train_test_highest(fname, codeprefixlen, mincodeoccurences, filtered_by):
    """Build train/test record-generator factories over sufficiently frequent labels.

    Parameters:
        fname            -- record source handed to gen_record.
        codeprefixlen    -- code-prefix length forwarded to gen_record_prefixed.
        mincodeoccurences -- minimum occurrence count for a label to be kept.
        filtered_by      -- filter argument forwarded to gen_record.

    Returns a 5-tuple
        (train_generator, test_generator, elements_count, labels, elements_count);
    the element count is deliberately repeated to preserve the historical
    return shape expected by existing callers.
    """
    # Generator factories: each call re-reads the record stream from scratch,
    # so the data is never held in memory all at once.
    rec_generator = lambda: gen_record(fname, filtered_by)
    prefixed_rec_generator = lambda: gen_record_prefixed(rec_generator, codeprefixlen)
    prefix_code_generator = lambda: gen_lmc(prefixed_rec_generator)
    # Keep only labels occurring at least mincodeoccurences times.
    PRINTER('generating labels...')
    labels = get_labels_min_occurence(prefix_code_generator, mincodeoccurences)
    PRINTER('labels generated:')
    PRINTER(str(labels))
    # Rebind the factory so it yields only records carrying a kept label.
    labelsset = set(labels)
    prefix_code_generator = lambda: gen_record_filteredbylabels(prefixed_rec_generator, labelsset)
    PRINTER('counting elements...')
    elements_count = len(list(prefix_code_generator()))
    PRINTER('number of elements' + str(elements_count))
    # Roughly 10% of the elements are held out for testing.
    PRINTER('splitting into training and testing...')
    train_inds, test_inds = randomly_divide(elements_count, int(elements_count / 10))
    train_generator = lambda: gen_record_fromshifts(prefix_code_generator, train_inds)
    test_generator = lambda: gen_record_fromshifts(prefix_code_generator, test_inds)
    PRINTER('splitted.')
    # Fix: the original re-counted the entire filtered stream here
    # (elements_count = len(list(prefix_code_generator()))) immediately before
    # returning. Nothing between the first count and this point changes the
    # stream, so that second full pass was pure wasted I/O and was removed.
    return train_generator, test_generator, elements_count, labels, elements_count
def gen_train_test_kfold(labelsset, prefix_code_generator, elements_count, kfold):
    """Yield (train_records, test_records) list pairs for k-fold cross-validation.

    Parameters:
        labelsset             -- kept for interface compatibility (unused here).
        prefix_code_generator -- zero-argument factory returning the record stream.
        elements_count        -- total number of records in the stream.
        kfold                 -- number of folds.

    Yields kfold pairs; in each pair the test list is one round-robin bucket
    of indices and the train list is everything else.
    """
    # Deal element indices round-robin into kfold buckets.
    buckets = [[] for _ in range(kfold)]
    for ind in range(elements_count):
        buckets[ind % kfold].append(ind)
    for test_bucket_ind in range(kfold):
        test_inds = buckets[test_bucket_ind]
        # Fix: flatten the remaining buckets with a single comprehension.
        # The original reduce(lambda a, b: a + b, ...) was O(n^2) in list
        # concatenations and raised TypeError on an empty sequence when
        # kfold == 1; this form is linear and yields [] in that case.
        train_inds = sorted(
            ind
            for bucket_ind, bucket in enumerate(buckets)
            if bucket_ind != test_bucket_ind
            for ind in bucket)
        # Materialize both splits so each yielded pair is independently reusable.
        train_generator = list(gen_record_fromshifts(prefix_code_generator, train_inds))
        test_generator = list(gen_record_fromshifts(prefix_code_generator, test_inds))
        yield train_generator, test_generator
def gen_train_test_kfold(labelsset, prefix_code_generator, elements_count, kfold):
    """Yield one (train_records, test_records) pair per fold of a k-fold split.

    NOTE(review): this duplicates an identical definition earlier in this
    file; the later one shadows the earlier at import time -- confirm which
    copy is intended to survive.

    labelsset is accepted for interface compatibility but not used here.
    """
    # Assign element indices to folds round-robin: index i lands in fold i % kfold.
    folds = [[] for _ in xrange(kfold)]
    for idx in xrange(elements_count):
        folds[idx % kfold].append(idx)
    # Hold out each fold in turn; everything else becomes the training set.
    for held_out in xrange(kfold):
        eval_inds = folds[held_out]
        remaining = folds[:held_out] + folds[held_out + 1:]
        fit_inds = sorted(reduce(lambda acc, fold: acc + fold, remaining))
        fit_records = list(gen_record_fromshifts(prefix_code_generator, fit_inds))
        eval_records = list(gen_record_fromshifts(prefix_code_generator, eval_inds))
        yield fit_records, eval_records
# --- Label generation: keep only codes that occur often enough. ---
# NOTE(review): prefix_code_generator, prefixed_rec_generator, mincodeoccurences,
# distancetrainingsteps, k and smoothingparam are defined before this chunk --
# confirm against the surrounding file.
print "generating labels..."
labels = get_labels_min_occurence(prefix_code_generator, mincodeoccurences)
labelsset = set(labels)
print "labels generated."
print labels
#gen filtered records:
# Rebind the factory so it yields only records carrying a kept label.
prefix_code_generator = lambda: gen_record_filteredbylabels(prefixed_rec_generator, labelsset)
print "counting elements..."
# One full pass over the stream just to count it.
elements_count = len(list(prefix_code_generator()))
print "number of elements:", elements_count
#split into training and testing samples
# Roughly 10% of the elements are held out for testing.
print "splitting into training and testing..."
train_inds, test_inds = randomly_divide(elements_count, int(elements_count/10))
train_generator = lambda: gen_record_fromshifts(prefix_code_generator, train_inds)
test_generator = lambda: gen_record_fromshifts(prefix_code_generator, test_inds)
print "splitted."
#train mlknn:
print "training distance..."
# Distance is trained on the training split only (total minus the ~10% test share).
zbldistance = jaccard_distance.JaccardDistance(train_generator, elements_count-int(elements_count/10), distancetrainingsteps)
print "training hierarchical mlknn..."
# Factory producing an MlKnn classifier for a given training generator.
mlknn_callable = lambda train_gen: mlknn.MlKnn(train_gen, zbldistance, find_closest_points.find_closest_points, k, smoothingparam)
# Three hierarchy levels: labels truncated to 2 chars, 3 chars, then full labels.
label_mappings = (lambda x: x[:2], lambda x: x[:3], lambda x: x)
# Matching record transforms per level: prefix length 2, 3, then untouched.
record_mappings = (lambda x: gen_1record_prefixed(x, 2), lambda x: gen_1record_prefixed(x, 3), lambda x: x)
# NOTE(review): "hierarhical" is a typo kept as-is -- later code may reference
# this exact name.
hierarhical_mlknn = ml_hierarchical.MlHierarchical(train_generator, mlknn_callable, label_mappings, record_mappings)
# --- Filter records by the surviving labels, then split and train. ---
# NOTE(review): labels, prefixed_rec_generator, distancetrainingsteps, k and
# smoothingparam come from code before this chunk -- confirm in the full file.
labelsset = set(labels)
print "labels generated."
print labels
#gen filtered records:
# Rebind the factory so it yields only records carrying a kept label.
prefix_code_generator = lambda: gen_record_filteredbylabels(
    prefixed_rec_generator, labelsset)
print "counting elements..."
# One full pass over the stream just to count it.
elements_count = len(list(prefix_code_generator()))
print "number of elements:", elements_count
#split into training and testing samples
# Roughly 10% of the elements are held out for testing.
print "splitting into training and testing..."
train_inds, test_inds = randomly_divide(elements_count, int(elements_count / 10))
train_generator = lambda: gen_record_fromshifts(prefix_code_generator, train_inds)
test_generator = lambda: gen_record_fromshifts(prefix_code_generator, test_inds)
print "splitted."
#train mlknn:
print "training distance..."
# Distance is trained on the training split only (total minus the ~10% test share).
zbldistance = jaccard_distance.JaccardDistance(
    train_generator, elements_count - int(elements_count / 10),
    distancetrainingsteps)
print "training hierarchical mlknn..."
# Factory producing an MlKnn classifier for a given training generator.
mlknn_callable = lambda train_gen: mlknn.MlKnn(
    train_gen, zbldistance, find_closest_points.find_closest_points, k,
    smoothingparam)
# NOTE(review): this chunk begins inside a try: block that starts before the
# visible source -- the matching try: is not shown here.
    distancetype = sys.argv[8]
# NOTE(review): bare except: swallows every exception (including SystemExit /
# KeyboardInterrupt on Py2's exception hierarchy quirks) -- should be
# except IndexError: at minimum.
except:
    print '8th argument expected: type of distance. Available: jac, g0, g1, g2'
    sys.exit(1)
PRINTER('Loading training list...')
from tools.pickle_tools import read_pickle
# Full training list materialized from a pickle on disk.
all_train_generator_list = read_pickle(load_train_generator_path)
PRINTER('Dividing the train_generator_list into training set and validation set...')
from tools.randomly_divide import randomly_divide
from data_io.zbl_record_generators import gen_record_fromshifts
elements_count = len(all_train_generator_list)
# 20% of the loaded list is held out for validation.
train_inds, validate_inds = randomly_divide(elements_count, int(elements_count / 5))
train_generator_list = list(gen_record_fromshifts(lambda: all_train_generator_list, train_inds))
validate_generator_list = list(gen_record_fromshifts(lambda: all_train_generator_list, validate_inds))
PRINTER('Loading labels path and elements count...')
lenlabels = len(read_pickle(load_labels_path))
# NOTE(review): this OVERWRITES the elements_count computed above with a
# pickled value -- the 80/20 split above used the local length instead;
# confirm this is intentional.
elements_count = read_pickle(load_elements_count_path)
PRINTER("training distance...")
# Factory wrapping the materialized training list for distance training.
train_generator = lambda: train_generator_list
if distancetype=='jac':
    from mlknn.jaccard_distance import JaccardDistance
    # Trained on the pickled count minus a ~10% share; presumably mirrors the
    # split convention used elsewhere in this project -- TODO confirm.
    zbldistance = JaccardDistance(train_generator, elements_count-int(elements_count/10), distancetrainingsteps)
else:
    # Any non-'jac' value (g0/g1/g2 per the usage message) selects the cosine
    # distance, parameterized by the distancetype string itself.
    from mlknn.txt_cosine_distance import TxtCosineDistance
    zbldistance = TxtCosineDistance(distancetype)