def compress_inverted_index(inverted_index, filename):
    """Write every posting list of *inverted_index* to *filename*, gamma-encoded.

    For each ``(word, posting list)`` pair the list is encoded, the word is
    registered in ``SingleStringDict`` (with the posting list's ``df`` and
    the byte offset at which its record starts), and the record is appended
    to the file.

    Record layout, as written by this function:

    * ``struct.pack('I', n)`` -- ``n`` is the number of bits in the gamma
      bit string (native byte order and size);
    * the bit string packed eight bits per byte.  A trailing group shorter
      than eight bits is stored as its plain integer value, i.e. the unused
      high bits of the last byte are zero.

    A posting list that cannot be encoded is reported on stdout and skipped:
    it gets no dictionary entry and no bytes in the file.  Errors raised by
    ``SingleStringDict.add_word`` or by the file write itself propagate to
    the caller.

    :param inverted_index: mapping of word -> posting list.  Each posting
        list must expose ``df`` and be accepted by
        ``pregamma_handle_didlist``.
    :param filename: path of the postings file; it is created or truncated.
    """

    def encode_posting_gamma(postinglist):
        """Return the complete on-disk record for one posting list."""
        bits = gamma_list(pregamma_handle_didlist(postinglist))
        # A slice that runs past the end simply yields the short tail, so
        # the final partial group needs no special case.
        packed = bytearray(
            int(bits[start:start + 8], 2) for start in range(0, len(bits), 8)
        )
        return struct.pack('I', len(bits)) + bytes(packed)

    with open(filename, 'wb') as f:
        for key, postlist in inverted_index.items():
            # Encode before touching the dictionary or the file, so that a
            # failure cannot leave a dictionary entry pointing at missing or
            # half-written data.
            try:
                record = encode_posting_gamma(postlist)
            except Exception as e:
                # Best effort: report the failure and carry on with the
                # remaining words.
                print('cannot encode posting list for %r: %r' % (key, e))
                print(postlist)
                continue
            SingleStringDict.add_word(
                word=key, df=postlist.df, post_list_id=f.tell()
            )
            f.write(record)
    # NOTE(review): a later fragment of this file shows
    # ``SingleStringDict.compress(filename)`` following this loop.  It is not
    # part of the code visible for this definition, so it is not added here
    # -- confirm whether it belongs.
def compress_inverted_index(inverted_index, filename):
    """Write every posting list of *inverted_index* to *filename*, gamma-encoded.

    For each ``(word, posting list)`` pair the list is encoded, the word is
    registered in ``SingleStringDict`` (with the posting list's ``df`` and
    the byte offset at which its record starts), and the record is appended
    to the file.

    Record layout, as written by this function:

    * ``struct.pack('I', n)`` -- ``n`` is the number of bits in the gamma
      bit string (native byte order and size);
    * the bit string packed eight bits per byte.  A trailing group shorter
      than eight bits is stored as its plain integer value, i.e. the unused
      high bits of the last byte are zero.

    A posting list that cannot be encoded is reported on stdout and skipped:
    it gets no dictionary entry and no bytes in the file.  Errors raised by
    ``SingleStringDict.add_word`` or by the file write itself propagate to
    the caller.

    :param inverted_index: mapping of word -> posting list.  Each posting
        list must expose ``df`` and be accepted by
        ``pregamma_handle_didlist``.
    :param filename: path of the postings file; it is created or truncated.
    """

    def encode_posting_gamma(postinglist):
        """Return the complete on-disk record for one posting list."""
        bits = gamma_list(pregamma_handle_didlist(postinglist))
        # A slice that runs past the end simply yields the short tail, so
        # the final partial group needs no special case.
        packed = bytearray(
            int(bits[start:start + 8], 2) for start in range(0, len(bits), 8)
        )
        return struct.pack('I', len(bits)) + bytes(packed)

    with open(filename, 'wb') as f:
        for key, postlist in inverted_index.items():
            # Encode before touching the dictionary or the file, so that a
            # failure cannot leave a dictionary entry pointing at missing or
            # half-written data.
            try:
                record = encode_posting_gamma(postlist)
            except Exception as e:
                # Best effort: report the failure and carry on with the
                # remaining words.
                print('cannot encode posting list for %r: %r' % (key, e))
                print(postlist)
                continue
            SingleStringDict.add_word(
                word=key, df=postlist.df, post_list_id=f.tell()
            )
            f.write(record)
    # NOTE(review): a later fragment of this file shows
    # ``SingleStringDict.compress(filename)`` following this loop.  It is not
    # part of the code visible for this definition, so it is not added here
    # -- confirm whether it belongs.
def decompress_inverted_index(filename):
    """Rebuild the word dictionary and the inverted index stored for *filename*.

    The dictionary comes from ``SingleStringDict.decompress(filename)``.  For
    every ``(word, offset)`` pair in it, the posting list is fetched with
    ``seek_inverted_index_file(filename, offset)``.

    :param filename: path handed to ``SingleStringDict.decompress``, to
        ``open`` and to ``seek_inverted_index_file``.
    :return: tuple ``(dictionary, inverted index)``, where the inverted
        index maps each word to the value returned by
        ``seek_inverted_index_file``.
    """
    word_table = SingleStringDict.decompress(filename)
    # The file is held open while the index is rebuilt, but the handle itself
    # is never read: the lookups receive the path, not the handle.
    with open(filename, 'rb'):
        # TODO (carried over, unfinished in the original):
        # "there are something cause the ..."
        postings = dict(
            (term, seek_inverted_index_file(filename, position))
            for term, position in word_table.items()
        )
    return (word_table, postings)
def decompress_dict(filename):
    """Return whatever ``SingleStringDict.decompress`` yields for *filename*.

    :param filename: passed through unchanged to
        ``SingleStringDict.decompress``.
    :return: the value returned by ``SingleStringDict.decompress(filename)``.
    """
    word_table = SingleStringDict.decompress(filename)
    return word_table
else: # padding f.write(struct.pack('B', int(gammadata[i:], 2))) # write inverted index into file with open(filename, 'wb') as f: for key, postlist in inverted_index.items(): offset = f.tell() SingleStringDict.add_word(word=key, df=postlist.df, post_list_id=offset) try: write_posting_gamma(postlist) except Exception, e: print inverted_index[key] SingleStringDict.compress(filename) def pregamma_handle_didlist(postlinglist): def prehandle_dtlist(plist): pl = sorted(plist)[::-1] for i in range(len(plist) - 1): pl[i] -= pl[i + 1] pl = pl[::-1] # Note: If the first element is 0, the Gamma cannot represent, so +1 pl[0] += 1 return pl resultpl = [] pl = sorted(postlinglist.docitemmap.values(), key=lambda x: x.id) did = pl[0].id
for i in range(0, len(gammadata), 8): if i + 8 < len(gammadata): f.write(struct.pack('B', int(gammadata[i:i + 8], 2))) else: # padding f.write(struct.pack('B', int(gammadata[i:], 2))) # write inverted index into file with open(filename, 'wb') as f: for key, postlist in inverted_index.items(): offset = f.tell() SingleStringDict.add_word(word = key, df = postlist.df, post_list_id = offset) try: write_posting_gamma(postlist) except Exception, e: print inverted_index[key] SingleStringDict.compress(filename) def pregamma_handle_didlist(postlinglist): def prehandle_dtlist(plist): pl = sorted(plist)[::-1] for i in range(len(plist) - 1): pl[i] -= pl[i + 1] pl = pl[::-1] # Note: If the first element is 0, the Gamma cannot represent, so +1 pl[0] += 1 return pl resultpl = [] pl = sorted(postlinglist.docitemmap.values(), key= lambda x: x.id) did = pl[0].id