levels_wanted - 1)

    # Report the labels that remain after the filtering step performed just
    # above this point.
    PRINTER("Labels after filtering by divisability: " + str(biggest_labels))
    PRINTER("Number of labels: " + str(len(biggest_labels)))

    # Keep at most biggest_labels_cnt labels from the front of the list.
    # NOTE(review): presumably the list is ordered by descending frequency at
    # this point -- confirm against the code that builds it.
    biggest_labels = biggest_labels[:biggest_labels_cnt]

    PRINTER("Labels after cutting only frequent labels: " +
            str(biggest_labels))
    PRINTER("Number of labels: " + str(len(biggest_labels)))

    labelsset = set(biggest_labels)
    PRINTER(biggest_labels)

    # Zero-argument factory: every call starts a fresh pass over the records
    # filtered by the selected labels, so they can be traversed several times.
    prefix_code_generator = lambda: gen_record_filteredbylabels(
        prefixed_rec_generator, labelsset)
    PRINTER("counting elements...")
    # Count while streaming instead of building a throwaway list of records.
    elements_count = sum(1 for _ in prefix_code_generator())
    PRINTER("number of elements:" + str(elements_count))

    codes_generator = lambda: gen_lmc(prefix_code_generator)
    labelperdocuments_counts = get_labelperdocuments_counts(codes_generator)
    PRINTER("labels per document statistics:" + str(labelperdocuments_counts))

    # Weighted mean of the keys, using the values as weights.
    # NOTE(review): presumably key = labels per document and value = number of
    # such documents -- confirm against get_labelperdocuments_counts.
    l = list(labelperdocuments_counts.iteritems())
    weighted_sum = sum(le[0] * le[1] for le in l)
    weight_total = sum(le[1] for le in l)
    # float() forces true division; plain "/" on two ints truncates the
    # average under Python 2.
    PRINTER("average number of labels per document:" +
            str(float(weighted_sum) / weight_total))

    PRINTER("saving...")
    # Close the output file deterministically so buffered data is flushed
    # even if writing fails part-way.
    with open(savefname, 'w') as zbl_output:
        write_zbl_records(zbl_output, prefix_code_generator())
Ejemplo n.º 2
0
 # Draw shuffling_cnt records at random, without replacement.
 # random.sample raises ValueError when shuffling_cnt exceeds the number of
 # available records.
 PRINTER("shuffling in random")
 import random
 chosen_records = random.sample(list(rec_generator_first()), shuffling_cnt)
 # Zero-argument factory so the sampled records can be traversed repeatedly.
 rec_generator = lambda: chosen_records
 
 prefixed_rec_generator = lambda: gen_record_prefixed(rec_generator, codeprefixlen)
 prefix_code_generator = lambda: gen_lmc(prefixed_rec_generator)
 
 # Generate labels.
 PRINTER("generating labels...")
 labels_counts = get_labels_counts(prefix_code_generator, mincodeoccurences)
 # Order entries by their second field, largest first, and keep the first
 # field of the leading biggest_labels_cnt entries.
 # NOTE(review): presumably entries are (label, count) pairs -- confirm
 # against get_labels_counts.
 # A list comprehension is used instead of slicing map(...), which only works
 # while map returns a list.
 ranked_counts = sorted(labels_counts, key=lambda x: x[1], reverse=True)
 biggest_labels = [entry[0] for entry in ranked_counts][:biggest_labels_cnt]
 labelsset = set(biggest_labels)
 PRINTER(biggest_labels)
 
 # From here on prefix_code_generator is rebound to yield the records
 # filtered by the selected labels.
 prefix_code_generator = lambda: gen_record_filteredbylabels(prefixed_rec_generator, labelsset)
 PRINTER("counting elements...")
 # Count while streaming instead of building a throwaway list of records.
 elements_count = sum(1 for _ in prefix_code_generator())
 PRINTER("number of elements:"+str(elements_count))
 
 
 codes_generator = lambda: gen_lmc(prefix_code_generator)
 PRINTER("labels per document statistics:"+str(get_labelperdocuments_counts(codes_generator)))
 
 PRINTER("saving...")
 # Close the output file deterministically so buffered data is flushed even
 # if writing fails part-way.
 with open(savefname, 'w') as zbl_output:
     write_zbl_records(zbl_output, prefix_code_generator())
                                             reverse = True))
 
 # Narrow the candidate labels with filter_by_divisability.
 # NOTE(review): the exact meaning of the arguments 2 and levels_wanted-1 is
 # defined by that helper, which is not visible here -- confirm there.
 biggest_labels = filter_by_divisability(biggest_labels, 2, levels_wanted-1)
 
 PRINTER("Labels after filtering by divisability: "+str(biggest_labels))
 PRINTER("Number of labels: "+str(len(biggest_labels)))
 
 # Keep at most biggest_labels_cnt labels from the front of the list.
 biggest_labels = biggest_labels[:biggest_labels_cnt]
 
 PRINTER("Labels after cutting only frequent labels: "+str(biggest_labels))
 PRINTER("Number of labels: "+str(len(biggest_labels)))
 
 labelsset = set(biggest_labels)
 PRINTER(biggest_labels)
 
 # Zero-argument factory: every call starts a fresh pass over the records
 # filtered by the selected labels, so they can be traversed several times.
 prefix_code_generator = lambda: gen_record_filteredbylabels(prefixed_rec_generator, labelsset)
 PRINTER("counting elements...")
 # Count while streaming instead of building a throwaway list of records.
 elements_count = sum(1 for _ in prefix_code_generator())
 PRINTER("number of elements:"+str(elements_count))
 
 
 codes_generator = lambda: gen_lmc(prefix_code_generator)
 labelperdocuments_counts = get_labelperdocuments_counts(codes_generator)
 PRINTER("labels per document statistics:"+str(labelperdocuments_counts))
 
 # Weighted mean of the keys, using the values as weights.
 # NOTE(review): presumably key = labels per document and value = number of
 # such documents -- confirm against get_labelperdocuments_counts.
 l = list(labelperdocuments_counts.iteritems())
 weighted_sum = sum(le[0]*le[1] for le in l)
 weight_total = sum(le[1] for le in l)
 # float() forces true division; plain "/" on two ints truncates the average
 # under Python 2.
 PRINTER("average number of labels per document:"+str(float(weighted_sum)/weight_total))
 
 PRINTER("saving...")
 # Close the output file deterministically so buffered data is flushed even
 # if writing fails part-way.
 with open(savefname, 'w') as zbl_output:
     write_zbl_records(zbl_output, prefix_code_generator())