def split_train_test_highest(fname, codeprefixlen, mincodeoccurences, filtered_by):
    """Build training and testing record generators for the records in `fname`.

    The pipeline is `gen_record(fname, filtered_by)` ->
    `gen_record_prefixed(..., codeprefixlen)` -> `gen_record_filteredbylabels`,
    the last step being restricted to the labels returned by
    `get_labels_min_occurence(..., mincodeoccurences)`. The filtered records
    are counted and `randomly_divide` is called with that count and
    `elements_count // 10`; the two index collections it returns select the
    training and testing records through `gen_record_fromshifts`.

    Every stream is wrapped in a zero-argument callable so that each consumer
    can obtain a fresh generator.

    Returns:
        A 5-tuple `(train_generator, test_generator, elements_count, labels,
        elements_count)`. The count is repeated in the last position to keep
        the original return shape for existing callers.
    """
    # Prepare restartable generators.
    rec_generator = lambda: gen_record(fname, filtered_by)
    prefixed_rec_generator = lambda: gen_record_prefixed(rec_generator, codeprefixlen)
    prefix_code_generator = lambda: gen_lmc(prefixed_rec_generator)

    # Generate labels.
    PRINTER('generating labels...')
    labels = get_labels_min_occurence(prefix_code_generator, mincodeoccurences)
    PRINTER('labels generated:')
    PRINTER(str(labels))

    # Generate filtered records; a set gives constant-time membership tests.
    labelsset = set(labels)
    filtered_rec_generator = lambda: gen_record_filteredbylabels(prefixed_rec_generator, labelsset)

    PRINTER('counting elements...')
    # Count by streaming instead of len(list(...)) so the whole record set is
    # never held in memory just to obtain its size.
    elements_count = sum(1 for _ in filtered_rec_generator())
    PRINTER('number of elements' +str(elements_count))

    # Split into training and testing samples.
    PRINTER('splitting into training and testing...')
    train_inds, test_inds = randomly_divide(elements_count, elements_count // 10)
    train_generator = lambda: gen_record_fromshifts(filtered_rec_generator, train_inds)
    test_generator = lambda: gen_record_fromshifts(filtered_rec_generator, test_inds)
    PRINTER('splitted.')

    # The stream is not recounted here: a second full pass over the same
    # filtered records would yield the same number (assuming the underlying
    # source is unchanged between passes).
    return train_generator, test_generator, elements_count, labels, elements_count
# Generate labels.
# NOTE: print is always called with ONE parenthesised argument below; that
# form prints the same text under the Python 2 print statement and the
# Python 3 print function.
print("generating labels...")
labels = get_labels_min_occurence(prefix_code_generator, mincodeoccurences)
labelsset = set(labels)
print("labels generated.")
print(labels)

# Generate filtered records: from here on prefix_code_generator yields the
# output of gen_record_filteredbylabels for the label set computed above.
prefix_code_generator = lambda: gen_record_filteredbylabels(prefixed_rec_generator, labelsset)
print("counting elements...")
# Count by streaming so the full record list is never materialised in memory.
elements_count = sum(1 for _ in prefix_code_generator())
print("number of elements: " + str(elements_count))

# Split into training and testing samples; elements_count // 10 is passed to
# randomly_divide as the size argument (explicit floor division, so the value
# does not depend on the interpreter's "/" semantics).
print("splitting into training and testing...")
train_inds, test_inds = randomly_divide(elements_count, elements_count // 10)
train_generator = lambda: gen_record_fromshifts(prefix_code_generator, train_inds)
test_generator = lambda: gen_record_fromshifts(prefix_code_generator, test_inds)
print("splitted.")

# Train mlknn.
print("training distance...")
zbldistance = jaccard_distance.JaccardDistance(
    train_generator,
    elements_count - elements_count // 10,
    distancetrainingsteps)

print("training hierarchical mlknn...")
# Factory that builds an MlKnn classifier from a training generator, sharing
# the distance object trained above.
mlknn_callable = lambda train_gen: mlknn.MlKnn(
    train_gen,
    zbldistance,
    find_closest_points.find_closest_points,
    k,
    smoothingparam)

# One mapping per level: the first two truncate a label to its first 2 and 3
# characters, the last leaves it unchanged.
label_mappings = (lambda x: x[:2], lambda x: x[:3], lambda x: x)
# Record-side counterparts: gen_1record_prefixed with lengths 2 and 3, then
# identity.
record_mappings = (lambda x: gen_1record_prefixed(x, 2), lambda x: gen_1record_prefixed(x, 3), lambda x: x)
print "generating labels..." labels = get_labels_min_occurence(prefix_code_generator, mincodeoccurences) labelsset = set(labels) print "labels generated." print labels #gen filtered records: prefix_code_generator = lambda: gen_record_filteredbylabels( prefixed_rec_generator, labelsset) print "counting elements..." elements_count = len(list(prefix_code_generator())) print "number of elements:", elements_count #split into training and testing samples print "splitting into training and testing..." train_inds, test_inds = randomly_divide(elements_count, int(elements_count / 10)) train_generator = lambda: gen_record_fromshifts(prefix_code_generator, train_inds) test_generator = lambda: gen_record_fromshifts(prefix_code_generator, test_inds) print "splitted." #train mlknn: print "training distance..." zbldistance = jaccard_distance.JaccardDistance( train_generator, elements_count - int(elements_count / 10), distancetrainingsteps) print "training hierarchical mlknn..." mlknn_callable = lambda train_gen: mlknn.MlKnn( train_gen, zbldistance, find_closest_points.find_closest_points, k,
# The distance type is read from the 8th command-line argument.
try:
    distancetype = sys.argv[8]
except IndexError:
    # Only a missing argument is expected here; a bare "except:" would also
    # hide unrelated errors (including KeyboardInterrupt) behind this message.
    # Single-argument parenthesised print behaves the same on Python 2 and 3.
    print('8th argument expected: type of distance. Available: jac, g0, g1, g2')
    sys.exit(1)

PRINTER('Loading training list...')
from tools.pickle_tools import read_pickle
all_train_generator_list = read_pickle(load_train_generator_path)

PRINTER('Dividing the train_generator_list into training set and validation set...')
from tools.randomly_divide import randomly_divide
from data_io.zbl_record_generators import gen_record_fromshifts
elements_count = len(all_train_generator_list)
# randomly_divide receives the list length and one fifth of it (rounded
# down); its two results select the training and validation records.
train_inds, validate_inds = randomly_divide(elements_count, int(elements_count / 5))
train_generator_list = list(gen_record_fromshifts(lambda: all_train_generator_list, train_inds))
validate_generator_list = list(gen_record_fromshifts(lambda: all_train_generator_list, validate_inds))

PRINTER('Loading labels path and elements count...')
lenlabels = len(read_pickle(load_labels_path))
# From here on elements_count is the pickled value, no longer the length of
# all_train_generator_list computed above.
elements_count = read_pickle(load_elements_count_path)

PRINTER("training distance...")
train_generator = lambda: train_generator_list
if distancetype == 'jac':
    from mlknn.jaccard_distance import JaccardDistance
    # NOTE(review): the size passed here is derived from the pickled
    # elements_count, not from len(train_generator_list) -- confirm that this
    # is the value JaccardDistance expects for this training generator.
    zbldistance = JaccardDistance(train_generator, elements_count-int(elements_count/10), distancetrainingsteps)
else:
    # Any other value is handed to TxtCosineDistance unchanged.
    from mlknn.txt_cosine_distance import TxtCosineDistance
    zbldistance = TxtCosineDistance(distancetype)