import re
import vbcode
from timeit import default_timer as timer  # assumed: 'timer' is timeit's default_timer


def get_final_index():
    print('Compiling final index file')
    st = timer()
    lex_file = open('lexicon.dat', 'wb+')
    in_file = open('index.dat', 'wb+')
    i = 0
    with open('compiled_index.dat', 'r') as fil:
        cur_word = None
        arr_freqs = []
        arr_doc_ids = []
        while True:
            try:
                f_line = fil.readline()
                if not f_line:
                    break
                if f_line != '\n':  # 'is not' compared identity, not equality
                    f_line = re.sub("\n|\r", "", f_line)
                    arr = f_line.split(',')
                    if len(arr) >= 3:
                        if cur_word == arr[0]:  # appending to the same word's list
                            arr_doc_ids += list(map(int, arr[1:][0::2]))
                            arr_freqs += list(map(int, arr[1:][1::2]))
                        else:
                            if len(arr_freqs) > 0:  # not the first word: flush it to file
                                st_pos = in_file.tell()
                                d_bytes = in_file.write(vbcode.encode(arr_doc_ids))
                                f_bytes = in_file.write(vbcode.encode(arr_freqs))
                                # index file: seek to start pos, read d_bytes, then f_bytes
                                lex_file.write('{},{},{},{}\n'.format(
                                    cur_word, st_pos, d_bytes, f_bytes).encode('ascii'))
                            cur_word = arr[0]
                            arr_doc_ids = list(map(int, arr[1:][0::2]))
                            arr_freqs = list(map(int, arr[1:][1::2]))
            except Exception as e:
                print(e, i)
                break
            i += 1
        if len(arr_freqs) > 0:  # flush the final word, which the loop above never writes
            st_pos = in_file.tell()
            d_bytes = in_file.write(vbcode.encode(arr_doc_ids))
            f_bytes = in_file.write(vbcode.encode(arr_freqs))
            lex_file.write('{},{},{},{}\n'.format(
                cur_word, st_pos, d_bytes, f_bytes).encode('ascii'))
    lex_file.close()
    in_file.close()
    end = timer()
    print('Time Taken: ', end - st)
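# The lexicon line format above ('word,start,d_bytes,f_bytes') is enough to read
# a posting list back. A minimal read-back sketch, assuming vbcode.decode
# returns the encoded integer list (as the tests further down show):
def lookup_postings(word, lexicon_path='lexicon.dat', index_path='index.dat'):
    with open(lexicon_path, 'rb') as lex_file, open(index_path, 'rb') as in_file:
        for line in lex_file:
            w, st_pos, d_bytes, f_bytes = line.decode('ascii').rstrip('\n').split(',')
            if w == word:
                in_file.seek(int(st_pos))
                doc_ids = vbcode.decode(in_file.read(int(d_bytes)))
                freqs = vbcode.decode(in_file.read(int(f_bytes)))
                return doc_ids, freqs
    return None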
def encode_posting(args):
    term, postings = args
    delta_id = calculate_deltas(map(attrgetter('id'), postings))
    encoded_entry = b''
    for posting, id in zip(postings, delta_id):
        # Format: [id, tf, positions len, *positions]
        decimal_tf = (int(posting.weighted_tf * TF_MULTIPLIER)
                      if posting.weighted_tf != 1.0 else 1)
        encoded_entry += vbcode.encode([id, decimal_tf, len(posting.positions)])
        encoded_entry += vbcode.encode(posting.positions)
    return term, encoded_entry
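# calculate_deltas is not part of this snippet. A hypothetical sketch consistent
# with how it is used above (first id kept as-is, later ids stored as gaps):
def calculate_deltas(ids):
    prev = None
    for current in ids:
        yield current if prev is None else current - prev
        prev = current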
def SavetoFile(self):
    """ save dictionary, postings and skip pointers given from build_index() to file """
    print('saving to file...')
    # Initialize out files
    write_dictionary = open(self.dictionary_file, "wb")
    write_postings = open(self.postings_file, "wb")
    # Set dictionary with idf values and pointers to postings, pickle postings
    for key in sorted(self.postings):
        self.dictionary[key] = write_postings.tell()
        pickle.dump(self.postings[key][0], write_postings)
        bs = vbcode.encode(self.postings[key][1])
        pickle.dump(bs, write_postings)
        pickle.dump(self.postings[key][2], write_postings)
        for i in range(len(self.postings[key][3])):
            bs = vbcode.encode(self.postings[key][3][i])
            pickle.dump(bs, write_postings)
    # Pickle dictionary
    pickle.dump(self.outlinks, write_dictionary)
    pickle.dump(self.date_field, write_dictionary)
    pickle.dump(self.dictionary, write_dictionary)
    # Close all files
    write_dictionary.close()
    write_postings.close()
    print('saved to file successfully!')
def test_vb_encode(numbers, ok):
    bytestream = vbcode.encode(numbers)
    assert ''.join([
        format(b, '08b')
        for b in unpack('%dB' % len(bytestream), bytestream)
    ]) == ok
    print("test ok. %s -> %s" % (numbers, ok))
def write_from_freq_offsets(filename, document_freq, term_offsets):
    with open(filename, 'wb') as f:
        for term in document_freq:
            # Format: [term len, document freq, offset, term]
            term_bytes = term.encode('utf-8')  # bytes(term) raises for str in Python 3
            encoded_entry = vbcode.encode(
                [len(term_bytes), document_freq[term], term_offsets[term]])
            f.write(encoded_entry + term_bytes)
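# A hypothetical reader for the record format above. Variable-byte code is
# self-delimiting (the high bit marks a number's final byte, as the decode
# example at the end shows), so the three header integers can be peeled off
# before the raw term bytes:
def read_entry(f):
    header, terminators = b'', 0
    while terminators < 3:
        byte = f.read(1)
        if not byte:
            return None  # end of file
        header += byte
        if byte[0] & 0x80:  # high bit set: final byte of a number
            terminators += 1
    term_len, doc_freq, offset = vbcode.decode(header)
    term = f.read(term_len).decode('utf-8')
    return term, doc_freq, offset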
def _save(self):
    if len(self.ids) > 0:
        begin_id = self.ids[0]
        self.begin_ids.append(begin_id)

        encoded = vbcode.encode(self.ids)
        start_length_pair = self.write(encoded)
        self.offsets_id.append(start_length_pair)

        encoded = vbcode.encode(self.tfs)
        start_length_pair = self.write(encoded)
        self.offsets_tf.append(start_length_pair)

        encoded = vbcode.encode(self.scores)
        start_length_pair = self.write(encoded)
        self.offsets_score.append(start_length_pair)

        self.last_id = 0
        self.ids = []
        self.tfs = []
        self.scores = []
def EncodePositionalIndex(term_dict, type):
    compressed_positional = {}
    compressed_amount = 0
    for term in term_dict:
        compressed_positional[term] = {}
        for docid in term_dict[term]:
            if type == "gamma":
                positions = encode_Gamma(docid[1])
            else:
                positions = encode(docid[1])
            compressed_amount += sys.getsizeof(docid[1]) - sys.getsizeof(positions)
            compressed_positional[term].update({docid[0]: positions})
    print("size reduction after compressing with", type, ":", compressed_amount)
    return compressed_positional
def test_vbc():
    # format() requires Python 2.6 or higher.
    def test_vb_encode(numbers, ok):
        bytestream = vbcode.encode(numbers)
        assert ''.join([
            format(b, '08b')
            for b in unpack('%dB' % len(bytestream), bytestream)
        ]) == ok
        print("test ok. %s -> %s" % (numbers, ok))

    test_vb_encode([1], '10000001')
    test_vb_encode([5], '10000101')
    test_vb_encode([127], '11111111')
    test_vb_encode([128], '00000001' + '10000000')
    test_vb_encode([129], '00000001' + '10000001')

    import sys, random
    for i in range(1000):  # xrange/sys.maxint are Python 2 only
        n = random.randint(0, sys.maxsize)
        assert vbcode.decode(vbcode.encode([n]))[0] == n
dist = int(math.sqrt(len(values)))  # a slice step must be an int, not a float
jump_table = values[::dist]
for i in reversed(range(1, len(jump_table))):  # delta-encode the jump table
    jump_table[i] -= jump_table[i - 1]
for v in values:
    entries = list(revertIndex[key][v])
    for i in reversed(range(1, len(entries))):  # delta-encode each posting list
        entries[i] = entries[i] - entries[i - 1]
    list_for_compression.append(len(entries))
    list_for_compression.extend(entries)
list_for_compression.append(len(jump_table))
list_for_compression.extend(jump_table)
compressed = vbcode.encode(list_for_compression)
bytearr += struct.pack('I{}s0I'.format(len(compressed)), len(compressed), compressed)
revIndexFile.write(bytearr)

allDocIds = sorted(forwardIndex.keys())
for docId in allDocIds:
    title = forwardIndex[docId].title.encode('utf-8')
    url = forwardIndex[docId].url.encode('utf-8')
    bytearr = struct.pack('II{}sI{}s0I'.format(len(title), len(url)),
                          docId, len(title), title, len(url), url)
    forwardIndexFile.write(bytearr)
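# Reading one of these records back mirrors the packing: a 4-byte length
# prefix, the variable-byte payload, then running sums to undo the deltas.
# A sketch assuming the same vbcode module; note the trailing '0I' in the
# pack format pads each record to 4-byte alignment:
import struct
import vbcode

def read_compressed_record(f):
    (length,) = struct.unpack('I', f.read(4))
    padded = struct.calcsize('{}s0I'.format(length))  # payload plus alignment padding
    numbers = vbcode.decode(f.read(padded)[:length])
    return numbers  # caller re-applies running sums to recover the lists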
def create_raw_invert_index(direct_index):
    raw_invert_index = defaultdict(lambda: defaultdict(lambda: [int, []]))
    """
    raw_invert_index: {
        word: {
            doc_id: (token count of the document, [positions where the word occurred])
            # the word's in-document frequency, needed for idf, is len(positions)
        }
    }
    """
    dir_name = DIR_WITH_TOKENS
    step = 0
    t = datetime.datetime.now()
    for doc_id, offset_for_title in direct_index.items():
        token = read_direct_index(offset_for_title)
        with open(f"../{dir_name}/{token}.txt", 'r') as f:
            tokens_list = json.load(f)
        for i in range(len(tokens_list)):
            word_hash = hash_str(tokens_list[i])
            raw_invert_index[word_hash][doc_id][0] = len(tokens_list)
            raw_invert_index[word_hash][doc_id][1].append(i)
        sys.stdout.write(
            f"\rAdded to raw inverted index: {step} / {len(direct_index)}")
        sys.stdout.flush()
        step += 1
    print(f"\nRaw inverted index built. {datetime.datetime.now() - t}."
          f"\nStarting index processing")

    # Cast so that raw_invert_index becomes a plain dict
    raw_invert_index = dict(raw_invert_index)
    len_raw_invert_index = len(raw_invert_index)
    step = 0
    t = datetime.datetime.now()
    offset_in_bin_file = 0
    total_files_in_corp = len(direct_index)
    for key_dict, value_dict in raw_invert_index.items():
        IDF = math.log10(total_files_in_corp / len(value_dict))
        offset_for_word = offset_in_bin_file
        # Encode the doc_ids
        sorted_doc_ids, vbcode_doc_ids = get_vb_code_for_doc_ids(value_dict.keys())
        # length of the vb code
        len_of_vbcode_doc_ids = len(vbcode_doc_ids)
        # Encode [pos_in_file, ...], compute the frequencies (freq) and the TF-IDF metric
        freqs = []
        tfidf = []
        for_write_pos_in_file = b''
        for elem in sorted_doc_ids:
            total_tokens_in_file, pos_in_file = value_dict[elem]
            current_freq = len(pos_in_file)
            freqs.append(current_freq)
            TF = current_freq / total_tokens_in_file
            TFIDF = TF * IDF
            tfidf.append(TFIDF)
            for_write_pos_in_file += vbcode.encode(sorted(pos_in_file))
        # vb-encoded length of the in-document positions
        len_of_vbcode_for_pos_in_files = len(for_write_pos_in_file)
        # Pack the frequencies
        frm = str(len(freqs)) + FORMAT_TO_UI
        write_freq = struct.pack(frm, *freqs)
        # Pack the TF-IDF values
        frm = str(len(tfidf)) + FORMAT_TO_FL
        write_tfidf = struct.pack(frm, *tfidf)
        res = vbcode_doc_ids + write_tfidf + write_freq + for_write_pos_in_file
        total_len = len(res)
        with open('bin_file', 'ab') as f:
            f.write(res)
        raw_invert_index[key_dict] = (offset_for_word, len_of_vbcode_doc_ids,
                                      len_of_vbcode_for_pos_in_files)
        offset_in_bin_file += total_len
        sys.stdout.write(
            f"\rBuilding inverted index: {step} / {len_raw_invert_index}")
        sys.stdout.flush()
        step += 1
    print(f"\nInverted index built. {datetime.datetime.now() - t}")
    return raw_invert_index
        # (tail of readPosting; the snippet is truncated above this point)
        coords = list(
            struct.unpack('<{}I'.format(coordsLen), r.read(4 * coordsLen)))
        entries[docId] = coords
    return (True, word, entries)


fileRevert = sys.argv[1]
r = open(fileRevert, 'rb')
w = open('{}_compress'.format(fileRevert), 'wb')
readyRevertIndex = dict()
flag = True
while flag:
    flag, word, entries = readPosting(r)
    if flag:
        keyLength = len(word)
        values = sorted(list(entries.keys()))
        bytearr = struct.pack('I{}sI'.format(keyLength), keyLength, word, len(values))
        for v in values:
            ent = entries[v]
            ent.insert(0, v)  # prepend the doc id to its coordinate list
            bytestream = vbcode.encode(ent)
            compressed = struct.pack('I{}s0I'.format(len(bytestream)),
                                     len(bytestream), bytestream)
            bytearr += compressed
        w.write(bytearr)
def compress_using_variable_byte(integers: list):
    return vbcode.encode(integers).hex()
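# For reference, with the bit patterns from the tests above
# (1 -> 10000001, 5 -> 10000101, 128 -> 00000001 10000000):
assert compress_using_variable_byte([1, 5, 128]) == '81850180'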
def variable_encoding(postings):
    return int.from_bytes(vbcode.encode(get_gaps_list(postings)), byteorder='big')
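# Hypothetical round trip, assuming get_gaps_list([3, 10]) -> [3, 7].
# vbcode.encode([3, 7]) == b'\x83\x87', so the int is 0x8387 == 33671; the
# packing is reversible because a variable-byte stream never starts with a
# zero byte:
n = variable_encoding([3, 10])
assert n == 0x8387
assert vbcode.decode(n.to_bytes((n.bit_length() + 7) // 8, 'big')) == [3, 7]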
def test_vb_decode():
    n = random.randint(0, sys.maxsize)
    assert vbcode.decode(vbcode.encode([n]))[0] == n
    print("test ok. %s -> %s" % (n, n))
def decode(bytestream):
    n = 0
    numbers = []
    bytestream = unpack('%dB' % len(bytestream), bytestream)
    for byte in bytestream:
        if byte < 128:
            # continuation byte: accumulate the next 7 bits
            n = 128 * n + byte
        else:
            # high bit set: final byte of this number
            n = 128 * n + (byte - 128)
            numbers.append(n)
            n = 0
    return numbers


vbcode.encode([777, 6789])   # b'\x06\x89' + b'\x35\x85'
vbcode.decode(b'\x06\x89')   # [777]