Ejemplo n.º 1
0
def split_train_test_highest(fname, codeprefixlen, mincodeoccurences, filtered_by):
    """Split the label-filtered records of ``fname`` into train/test generators.

    Every pipeline stage is a zero-argument callable, so each consumer can
    invoke it again to obtain a new pass over the records.

    Args:
        fname: Forwarded to ``gen_record`` as the record source.
        codeprefixlen: Forwarded to ``gen_record_prefixed``.
        mincodeoccurences: Forwarded to ``get_labels_min_occurence`` together
            with the code generator.
        filtered_by: Forwarded to ``gen_record``.

    Returns:
        The 5-tuple ``(train_generator, test_generator, elements_count,
        labels, elements_count)``. The count appears twice; that shape is
        kept unchanged so existing callers that unpack five values keep
        working.
    """
    # Prepare the generator factories.
    rec_generator = lambda: gen_record(fname, filtered_by)
    prefixed_rec_generator = lambda: gen_record_prefixed(rec_generator, codeprefixlen)
    code_generator = lambda: gen_lmc(prefixed_rec_generator)

    # Generate labels.
    PRINTER('generating labels...')
    labels = get_labels_min_occurence(code_generator, mincodeoccurences)
    PRINTER('labels generated:')
    PRINTER(str(labels))

    # Restrict the prefixed records using the label set.
    labelsset = set(labels)
    filtered_generator = lambda: gen_record_filteredbylabels(prefixed_rec_generator, labelsset)
    PRINTER('counting elements...')
    # Count by streaming: the records themselves are not needed here, so
    # there is no reason to hold all of them in a list at once.
    elements_count = sum(1 for _ in filtered_generator())
    PRINTER('number of elements: ' + str(elements_count))

    # Split into training and testing samples.
    PRINTER('splitting into training and testing...')
    # The second argument is one tenth of the records (floor division);
    # presumably the size of the test part -- confirm against randomly_divide.
    train_inds, test_inds = randomly_divide(elements_count, elements_count // 10)
    train_generator = lambda: gen_record_fromshifts(filtered_generator, train_inds)
    test_generator = lambda: gen_record_fromshifts(filtered_generator, test_inds)
    PRINTER('splitted.')

    # The count computed above is reused; a second full pass over the
    # filtered records would only recompute the same number.
    return train_generator, test_generator, elements_count, labels, elements_count
def load_labels_codegen_elemcnt(fname, codeprefixlen, mincodeoccurences, filtered_by):
    """Derive labels from ``fname`` and build a label-filtered record generator.

    Args:
        fname: Forwarded to ``gen_record`` as the record source.
        codeprefixlen: Forwarded to ``gen_record_prefixed``.
        mincodeoccurences: Forwarded to ``get_labels_min_occurence`` together
            with the code generator.
        filtered_by: Forwarded to ``gen_record``.

    Returns:
        The 4-tuple ``(labels, labelsset, prefix_code_generator,
        elements_count)``: the labels as returned by
        ``get_labels_min_occurence``, the same labels as a ``set``, a
        zero-argument callable producing the filtered records, and the
        number of records that callable yields.
    """
    # Prepare the generator factories; each is a zero-argument callable so
    # it can be invoked again for another pass.
    rec_generator = lambda: gen_record(fname, filtered_by)
    prefixed_rec_generator = lambda: gen_record_prefixed(rec_generator, codeprefixlen)
    code_generator = lambda: gen_lmc(prefixed_rec_generator)

    # Generate labels.
    PRINTER('generating labels...')
    labels = get_labels_min_occurence(code_generator, mincodeoccurences)
    PRINTER('labels generated:')
    PRINTER(str(labels))

    # Restrict the prefixed records using the label set.
    labelsset = set(labels)
    prefix_code_generator = lambda: gen_record_filteredbylabels(prefixed_rec_generator, labelsset)
    PRINTER('counting elements...')
    # Count by streaming instead of building a throwaway list of all records.
    elements_count = sum(1 for _ in prefix_code_generator())
    PRINTER('number of elements: ' + str(elements_count))

    return labels, labelsset, prefix_code_generator, elements_count
Ejemplo n.º 3
0
def load_labels_codegen_elemcnt(fname, codeprefixlen, mincodeoccurences,
                                filtered_by):
    """Compute labels for ``fname`` and a generator of label-filtered records.

    Args:
        fname: Forwarded to ``gen_record`` as the record source.
        codeprefixlen: Forwarded to ``gen_record_prefixed``.
        mincodeoccurences: Forwarded to ``get_labels_min_occurence`` together
            with the code generator.
        filtered_by: Forwarded to ``gen_record``.

    Returns:
        ``(labels, labelsset, prefix_code_generator, elements_count)`` where
        ``labelsset`` is ``set(labels)``, ``prefix_code_generator`` is a
        zero-argument callable producing the filtered records, and
        ``elements_count`` is how many records one pass of it yields.
    """
    # Generator factories: calling one starts a new pass through that stage.
    rec_generator = lambda: gen_record(fname, filtered_by)
    prefixed_rec_generator = lambda: gen_record_prefixed(
        rec_generator, codeprefixlen)
    code_generator = lambda: gen_lmc(prefixed_rec_generator)

    # Generate labels.
    PRINTER('generating labels...')
    labels = get_labels_min_occurence(code_generator, mincodeoccurences)
    PRINTER('labels generated:')
    PRINTER(str(labels))

    # Restrict the prefixed records using the label set.
    labelsset = set(labels)
    prefix_code_generator = lambda: gen_record_filteredbylabels(
        prefixed_rec_generator, labelsset)
    PRINTER('counting elements...')
    # Stream the records to count them; only the total is needed, so the
    # records are not collected into a list.
    elements_count = sum(1 for _ in prefix_code_generator())
    PRINTER('number of elements: ' + str(elements_count))

    return labels, labelsset, prefix_code_generator, elements_count
 # Fragment of a larger routine whose header is outside this excerpt.
 # Visible steps: configure logging, derive labels, filter records by those
 # labels, split record indices into train/test parts, and construct a
 # JaccardDistance from the training generator.
 # NOTE(review): the bare `print "..."` statements below are Python 2 syntax.
 logging.basicConfig(level=log_level)
 
 #prepare generators
 # Each stage is a zero-argument lambda, presumably so that every consumer
 # can call it again to restart iteration -- confirm against the helpers.
 rec_generator = lambda: gen_record(fname, filtered_by)
 prefixed_rec_generator = lambda: gen_record_prefixed(rec_generator, codeprefixlen)
 prefix_code_generator = lambda: gen_lmc(prefixed_rec_generator)
 
 #generate labels
 print "generating labels..."
 labels = get_labels_min_occurence(prefix_code_generator, mincodeoccurences)
 labelsset = set(labels)
 print "labels generated."
 print labels
 
 #gen filtered records:
 # prefix_code_generator is rebound here: from this point on it produces the
 # output of gen_record_filteredbylabels rather than of gen_lmc.
 prefix_code_generator = lambda: gen_record_filteredbylabels(prefixed_rec_generator, labelsset)
 print "counting elements..."
 # Materializes one full pass into a list only to take its length.
 elements_count = len(list(prefix_code_generator()))
 print "number of elements:", elements_count
 
 #split into training and testing samples
 print "splitting into training and testing..."
 # The second argument is one tenth of the element count; presumably the
 # size of the test part -- confirm against randomly_divide.
 train_inds, test_inds = randomly_divide(elements_count, int(elements_count/10))
 train_generator = lambda: gen_record_fromshifts(prefix_code_generator, train_inds)
 test_generator = lambda: gen_record_fromshifts(prefix_code_generator, test_inds)
 print "splitted."
 
 #train mlknn:
 print "training distance..."
 # The second argument is the element count minus the tenth passed to
 # randomly_divide above (presumably the number of training records).
 zbldistance = jaccard_distance.JaccardDistance(train_generator, elements_count-int(elements_count/10), distancetrainingsteps)
 
Ejemplo n.º 5
0
 # Fragment of a larger routine whose header is outside this excerpt.
 # Visible steps: sample records at random, pick the most frequent labels,
 # filter the sampled records by those labels, and write the result to a file.
 #choosing shuffling_cnt elements in random:
 PRINTER("shuffling in random")
 import random
 # All records from rec_generator_first() are loaded into a list before
 # sampling. random.sample picks without replacement and raises ValueError
 # when shuffling_cnt exceeds the number of records.
 chosen_records = random.sample(list(rec_generator_first()), shuffling_cnt)
 # Every call returns the same in-memory list of sampled records.
 rec_generator = lambda: chosen_records
 
 prefixed_rec_generator = lambda: gen_record_prefixed(rec_generator, codeprefixlen)
 prefix_code_generator = lambda: gen_lmc(prefixed_rec_generator)
 
 #generate labels
 PRINTER("generating labels...")
 labels_counts = get_labels_counts(prefix_code_generator, mincodeoccurences)
 #PRINTER("labels generated."
 #PRINTER(sorted(labels_counts, key = lambda x: x[1], reverse = True)
 # Sort the entries by their second item in descending order, keep each
 # entry's first item, then take the first biggest_labels_cnt of them
 # (presumably (label, count) pairs -- confirm against get_labels_counts).
 # NOTE(review): slicing the result of map() only works on Python 2, where
 # map returns a list; on Python 3 a map object is not subscriptable.
 biggest_labels = map(lambda x: x[0], sorted(labels_counts, key = lambda x: x[1], 
                                             reverse = True))[:biggest_labels_cnt]
 labelsset = set(biggest_labels)
 PRINTER(biggest_labels)
 
 #gen filtered records:
 # prefix_code_generator is rebound here: from this point on it produces the
 # output of gen_record_filteredbylabels rather than of gen_lmc.
 prefix_code_generator = lambda: gen_record_filteredbylabels(prefixed_rec_generator, labelsset)
 PRINTER("counting elements...")
 # Materializes one full pass into a list only to take its length.
 elements_count = len(list(prefix_code_generator()))
 PRINTER("number of elements:"+str(elements_count))
 
 
 codes_generator = lambda: gen_lmc(prefix_code_generator)
 PRINTER("labels per document statistics:"+str(get_labelperdocuments_counts(codes_generator)))
 
 PRINTER("saving...")
 # NOTE(review): the file object opened here is never explicitly closed in
 # the visible code; closing is left to garbage collection.
 write_zbl_records(open(savefname, 'w'), prefix_code_generator())