levels_wanted - 1) PRINTER("Labels after filtering by divisability: " + str(biggest_labels)) PRINTER("Number of labels: " + str(len(biggest_labels))) biggest_labels = biggest_labels[:biggest_labels_cnt] PRINTER("Labels after cutting only frequent labels: " + str(biggest_labels)) PRINTER("Number of labels: " + str(len(biggest_labels))) labelsset = set(biggest_labels) PRINTER(biggest_labels) #gen filtered records: prefix_code_generator = lambda: gen_record_filteredbylabels( prefixed_rec_generator, labelsset) PRINTER("counting elements...") elements_count = len(list(prefix_code_generator())) PRINTER("number of elements:" + str(elements_count)) codes_generator = lambda: gen_lmc(prefix_code_generator) labelperdocuments_counts = get_labelperdocuments_counts(codes_generator) PRINTER("labels per document statistics:" + str(labelperdocuments_counts)) l = list(labelperdocuments_counts.iteritems()) PRINTER("average number of labels per document:" + str(sum(le[0] * le[1] for le in l) / sum(le[1] for le in l))) PRINTER("saving...") write_zbl_records(open(savefname, 'w'), prefix_code_generator())
# Choose shuffling_cnt records at random, derive the most frequent labels
# among them, keep only records carrying those labels, and save the result
# to savefname.
PRINTER("shuffling in random")
import random

# random.sample needs a sequence, hence the list(); it raises ValueError when
# shuffling_cnt exceeds the number of available records.
chosen_records = random.sample(list(rec_generator_first()), shuffling_cnt)
rec_generator = lambda: chosen_records
prefixed_rec_generator = lambda: gen_record_prefixed(rec_generator, codeprefixlen)
prefix_code_generator = lambda: gen_lmc(prefixed_rec_generator)

# Generate labels.
PRINTER("generating labels...")
labels_counts = get_labels_counts(prefix_code_generator, mincodeoccurences)

# Order entries by their second field, descending, and keep the first field
# of the top biggest_labels_cnt entries.
# NOTE(review): presumably entries are (label, count) pairs -- confirm
# against get_labels_counts.
# A list comprehension (rather than map(...)[:n]) also works where map()
# returns an iterator.
biggest_labels = [
    entry[0]
    for entry in sorted(labels_counts, key=lambda x: x[1], reverse=True)
][:biggest_labels_cnt]
labelsset = set(biggest_labels)
PRINTER(biggest_labels)

# Generate filtered records. The name prefix_code_generator is rebound here
# on purpose: from now on it yields records restricted to labelsset.
prefix_code_generator = lambda: gen_record_filteredbylabels(
    prefixed_rec_generator, labelsset)

PRINTER("counting elements...")
# Count by streaming instead of materialising every record in a list.
elements_count = sum(1 for _ in prefix_code_generator())
PRINTER("number of elements:"+str(elements_count))

codes_generator = lambda: gen_lmc(prefix_code_generator)
PRINTER("labels per document statistics:"+str(get_labelperdocuments_counts(codes_generator)))

PRINTER("saving...")
# The context manager guarantees the output file is flushed and closed.
with open(savefname, 'w') as out_file:
    write_zbl_records(out_file, prefix_code_generator())
reverse = True)) biggest_labels = filter_by_divisability(biggest_labels, 2, levels_wanted-1) PRINTER("Labels after filtering by divisability: "+str(biggest_labels)) PRINTER("Number of labels: "+str(len(biggest_labels))) biggest_labels = biggest_labels[:biggest_labels_cnt] PRINTER("Labels after cutting only frequent labels: "+str(biggest_labels)) PRINTER("Number of labels: "+str(len(biggest_labels))) labelsset = set(biggest_labels) PRINTER(biggest_labels) #gen filtered records: prefix_code_generator = lambda: gen_record_filteredbylabels(prefixed_rec_generator, labelsset) PRINTER("counting elements...") elements_count = len(list(prefix_code_generator())) PRINTER("number of elements:"+str(elements_count)) codes_generator = lambda: gen_lmc(prefix_code_generator) labelperdocuments_counts = get_labelperdocuments_counts(codes_generator) PRINTER("labels per document statistics:"+str(labelperdocuments_counts)) l = list(labelperdocuments_counts.iteritems()) PRINTER("average number of labels per document:"+str(sum(le[0]*le[1] for le in l)/sum(le[1] for le in l))) PRINTER("saving...") write_zbl_records(open(savefname, 'w'), prefix_code_generator())