def test_VB_compression_plain_docids(IFfilename):
    '''
    Tests the performance of VB codes using plain docIds.

    Reads the inverted file at IFfilename, flattens the integer postings
    lists obtained from its lines into one sequence, runs ic.vb_encode and
    ic.vb_decode over it, and prints a report with the uncompressed size,
    the compressed size, the compression factor and both wall-clock timings.

    IFfilename -- path of the inverted file to benchmark.

    Nothing is returned; the report is written to standard output.
    '''
    int_bytes = calculate_int_bytes_of_IF(IFfilename)

    # Read through a context manager so the file handle is closed right
    # away instead of whenever the garbage collector gets to it.
    with open(IFfilename) as inverted_file:
        lines = inverted_file.readlines()
    all_postings_docids = flatten(get_postings_list_ints(lines))

    # Only the encoder call itself is timed.
    t = time.time()
    bytelists = ic.vb_encode(all_postings_docids)
    t2 = time.time() - t

    # since every entry in the flattened list is 8 bits long, the number of
    # used bytes is exactly the length of the flattened list
    compressed_bytes = len(flatten(bytelists))

    # Only the decoder call itself is timed; its result is discarded.
    t = time.time()
    ic.vb_decode(bytelists)
    t3 = time.time() - t

    # An inverted file without postings yields zero compressed bytes; report
    # the factor as unavailable rather than dying with ZeroDivisionError.
    if compressed_bytes:
        compression_factor = str(float(int_bytes) / float(compressed_bytes))
    else:
        compression_factor = 'n/a'

    # Single-argument, parenthesised print produces the same output with the
    # Python 2 print statement and with the Python 3 print function.
    print('VB-encoding: using plain doc ids')
    print('-------------------------------')
    print('# of IDs to compress: ' + str(len(all_postings_docids)))
    print('Uncompressed (bytes): ' + str(int_bytes))
    print('Compressed (bytes): ' + str(compressed_bytes))
    print('Compression factor: ' + compression_factor)
    print('Time to compress (s): ' + str(t2))
    print('Time to decompress (s): ' + str(t3))
    print('===================================')
def test_VB_compression_gaps_docids(IFfilename):
    '''
    Tests the performance of VB codes using gaps.

    Reads the inverted file at IFfilename, converts the integer postings
    lists obtained from its lines with get_gaps_from_int_lists, flattens
    the result into one sequence, runs ic.vb_encode and ic.vb_decode over
    it, and prints a report with the uncompressed size, the compressed
    size, the compression factor and both wall-clock timings.

    IFfilename -- path of the inverted file to benchmark.

    Nothing is returned; the report is written to standard output.
    '''
    int_bytes = calculate_int_bytes_of_IF(IFfilename)

    # Read through a context manager so the file handle is closed right
    # away instead of whenever the garbage collector gets to it.
    with open(IFfilename) as inverted_file:
        lines = inverted_file.readlines()

    int_lists = get_postings_list_ints(lines)
    gap_lists = get_gaps_from_int_lists(int_lists)
    all_postings_gapids = flatten(gap_lists)

    # Only the encoder call itself is timed.
    t = time.time()
    bytelists = ic.vb_encode(all_postings_gapids)
    t2 = time.time() - t

    # The length of the flattened encoder output is used as the compressed
    # size in bytes (presumably one byte per entry -- confirm against
    # ic.vb_encode).
    compressed_bytes = len(flatten(bytelists))

    # Only the decoder call itself is timed; its result is discarded.
    t = time.time()
    ic.vb_decode(bytelists)
    t3 = time.time() - t

    # An inverted file without postings yields zero compressed bytes; report
    # the factor as unavailable rather than dying with ZeroDivisionError.
    if compressed_bytes:
        compression_factor = str(float(int_bytes) / float(compressed_bytes))
    else:
        compression_factor = 'n/a'

    # Single-argument, parenthesised print produces the same output with the
    # Python 2 print statement and with the Python 3 print function.
    print('VB-encoding: using gaps instead of plain doc ids')
    print('-----------------------------------------------')
    print('# of IDs to compress: ' + str(len(all_postings_gapids)))
    print('Uncompressed (bytes): ' + str(int_bytes))
    print('Compressed (bytes): ' + str(compressed_bytes))
    print('Compression factor: ' + compression_factor)
    print('Time to compress (s): ' + str(t2))
    print('Time to decompress (s): ' + str(t3))
    print('===================================')