def get_final_index():
    """Merge the sorted runs in compiled_index.dat into the final inverted index.

    Reads compiled_index.dat line by line (format: word,doc_id,freq,doc_id,freq,...),
    concatenates the postings of consecutive lines that share the same word,
    variable-byte encodes each word's doc-id and frequency lists into index.dat,
    and records the word's byte offsets in lexicon.dat
    (format: word,start_pos,doc_bytes,freq_bytes).
    """
    print('Compiling final index file')
    st = timer()
    i = 0
    # 'with' guarantees all three files are closed even if parsing fails.
    with open('lexicon.dat', 'wb') as lex_file, \
            open('index.dat', 'wb') as in_file, \
            open('compiled_index.dat', 'r') as fil:

        def _flush(word, doc_ids, freqs):
            # Append one word's encoded postings to index.dat and record its
            # location in the lexicon: seek st_pos, read d_bytes then f_bytes.
            st_pos = in_file.tell()
            d_bytes = in_file.write(vbcode.encode(doc_ids))
            f_bytes = in_file.write(vbcode.encode(freqs))
            lex_file.write('{},{},{},{}\n'.format(
                word, st_pos, d_bytes, f_bytes).encode('ascii'))

        cur_word = None
        arr_doc_ids = []
        arr_freqs = []
        for f_line in fil:
            try:
                # Was `f_line is not '\n'`: identity comparison against a
                # string literal; use equality instead.
                if f_line != '\n':
                    f_line = re.sub("\n|\r", "", f_line)
                    arr = f_line.split(',')
                    if len(arr) >= 3:
                        # arr[1:] alternates doc_id,freq pairs.
                        doc_ids = list(map(int, arr[1:][0::2]))
                        freqs = list(map(int, arr[1:][1::2]))
                        if cur_word == arr[0]:
                            # Same word continues: extend its posting lists.
                            arr_doc_ids += doc_ids
                            arr_freqs += freqs
                        else:
                            # New word: flush the previous one (if any) first.
                            if arr_freqs:
                                _flush(cur_word, arr_doc_ids, arr_freqs)
                            cur_word = arr[0]
                            arr_doc_ids = doc_ids
                            arr_freqs = freqs
            except Exception as e:
                # Best-effort: report the failing line number and stop,
                # matching the original behavior of aborting on first error.
                print(e, i)
                break
            i += 1
        # BUG FIX: the original never wrote the final word's postings,
        # silently dropping the last term of the index.
        if arr_freqs:
            _flush(cur_word, arr_doc_ids, arr_freqs)
    end = timer()
    print('Time Taken: ', end - st)
Example #2
0
def encode_posting(args):
    """Variable-byte encode one term's posting list.

    Args:
        args: (term, postings) pair; each posting exposes .id, .weighted_tf
              and .positions (as used below).

    Returns:
        (term, encoded_entry) where encoded_entry is the concatenated vbcode
        bytes, one record per posting in the form
        [delta doc id, tf, len(positions), *positions].
    """
    term, postings = args
    # Store doc-id gaps (deltas) instead of absolute ids: smaller numbers
    # compress better under variable-byte coding.
    delta_ids = calculate_deltas(map(attrgetter('id'), postings))

    chunks = []
    # `doc_id` was named `id` in the original, shadowing the builtin.
    for posting, doc_id in zip(postings, delta_ids):
        # tf == 1.0 is stored as the literal 1; otherwise the float weight is
        # scaled to an integer so it can be vb-encoded.
        decimal_tf = int(posting.weighted_tf *
                         TF_MULTIPLIER) if posting.weighted_tf != 1.0 else 1
        chunks.append(vbcode.encode([doc_id, decimal_tf,
                                     len(posting.positions)]))
        chunks.append(vbcode.encode(posting.positions))

    # Join once instead of repeated `bytes +=` (worst-case quadratic).
    return term, b''.join(chunks)
Example #3
0
    def SavetoFile(self):
        """Persist the dictionary, postings and skip pointers built by build_index().

        Per term, the postings file receives (pickled, in order): entry [0],
        the vb-encoded doc-id list [1], the raw entry [2], then one vb-encoded
        list per element of entry [3].  The dictionary file stores outlinks,
        date_field and the term -> postings-file-offset dictionary.
        """
        print('saving to file...')

        # Context managers ensure the files are closed (and flushed) even if
        # pickling fails part-way through.
        with open(self.dictionary_file, "wb") as write_dictionary, \
                open(self.postings_file, "wb") as write_postings:

            # Record where each term's postings start, then pickle the
            # postings components in a fixed order.
            for key in sorted(self.postings):
                self.dictionary[key] = write_postings.tell()
                pickle.dump(self.postings[key][0], write_postings)

                # Doc ids are variable-byte compressed before pickling.
                pickle.dump(vbcode.encode(self.postings[key][1]),
                            write_postings)

                # Element [2] is stored uncompressed.
                pickle.dump(self.postings[key][2], write_postings)

                # Element [3] is a list of lists; compress each sub-list.
                for sub_list in self.postings[key][3]:
                    pickle.dump(vbcode.encode(sub_list), write_postings)

            # Dictionary file: outlinks, date field, then the offsets dict.
            pickle.dump(self.outlinks, write_dictionary)
            pickle.dump(self.date_field, write_dictionary)
            pickle.dump(self.dictionary, write_dictionary)

        print('save to file successfully!')
Example #4
0
 def test_vb_encode(numbers, ok):
     # Encode `numbers` and check its bit pattern against the expected string.
     encoded = vbcode.encode(numbers)
     raw_bytes = unpack('%dB' % len(encoded), encoded)
     bits = ''.join(format(byte, '08b') for byte in raw_bytes)
     assert bits == ok
     print("test ok. %s -> %s" % (numbers, ok))
Example #5
0
 def write_from_freq_offsets(filename, document_freq, term_offsets):
     """Write one record per term: vb-encoded [term len, doc freq, offset] + term.

     NOTE(review): bytes(term) only works if `term` is already bytes-like; a
     str here would raise TypeError on Python 3 — confirm against callers.
     """
     with open(filename, 'wb') as out:
         for term, freq in document_freq.items():
             header = vbcode.encode([len(term), freq, term_offsets[term]])
             out.write(header + bytes(term))
Example #6
0
    def _save(self):
        """Flush the buffered ids/tfs/scores to storage and reset the buffers.

        For a non-empty batch: remember the first doc id, then vb-encode each
        of the three parallel lists, write it via self.write, and record the
        returned (start, length) pair in the matching offsets list.
        """
        if self.ids:
            self.begin_ids.append(self.ids[0])

            # The three streams are handled identically, in this order:
            # encode, write, remember where the encoded run landed.
            for values, offsets in ((self.ids, self.offsets_id),
                                    (self.tfs, self.offsets_tf),
                                    (self.scores, self.offsets_score)):
                offsets.append(self.write(vbcode.encode(values)))

        # Start a fresh batch regardless of whether anything was written.
        self.last_id = 0
        self.ids = []
        self.tfs = []
        self.scores = []
Example #7
0
def EncodePositionalIndex(term_dict, type):
    """Compress the position lists of a positional index.

    term_dict maps term -> iterable of (doc_id, positions) pairs; `type`
    selects gamma coding ("gamma") or variable-byte coding (anything else).
    Returns {term: {doc_id: encoded positions}} and prints the total size
    reduction (measured with sys.getsizeof).
    """
    compressed_positional = {}
    saved = 0
    for term in term_dict:
        per_doc = {}
        for entry in term_dict[term]:
            raw_positions = entry[1]
            if type == "gamma":
                packed = encode_Gamma(raw_positions)
            else:
                packed = encode(raw_positions)
            saved += sys.getsizeof(raw_positions) - sys.getsizeof(packed)
            per_doc[entry[0]] = packed
        compressed_positional[term] = per_doc
    print("reduction of size after compress by", type, ":", saved)
    return compressed_positional
Example #8
0
def test_vbc():
    """Sanity tests for vbcode: known bit patterns plus encode/decode round trips.

    Ported from Python 2: the original used a `print` statement, `xrange`
    and `sys.maxint`, none of which exist in Python 3.
    """
    def test_vb_encode(numbers, ok):
        bytestream = vbcode.encode(numbers)
        bits = ''.join(format(b, '08b')
                       for b in unpack('%dB' % len(bytestream), bytestream))
        assert bits == ok
        print("test ok. %s -> %s" % (numbers, ok))

    # Values < 128 fit one byte with the high (terminator) bit set.
    test_vb_encode([1],   '10000001')
    test_vb_encode([5],   '10000101')
    test_vb_encode([127], '11111111')
    # 128 and up spill into a leading continuation byte.
    test_vb_encode([128], '00000001' + '10000000')
    test_vb_encode([129], '00000001' + '10000001')

    import sys, random
    for _ in range(1000):
        n = random.randint(0, sys.maxsize)
        assert vbcode.decode(vbcode.encode([n]))[0] == n
Example #9
0
def test_vbc():
    """Self-test for vbcode: fixed expected bit strings and random round trips.

    Modernized to Python 3 — the original relied on the Python 2 `print`
    statement, `xrange` and `sys.maxint`.
    """
    def test_vb_encode(numbers, ok):
        bytestream = vbcode.encode(numbers)
        assert ''.join([
            format(b, '08b')
            for b in unpack('%dB' % len(bytestream), bytestream)
        ]) == ok
        print("test ok. %s -> %s" % (numbers, ok))

    test_vb_encode([1], '10000001')
    test_vb_encode([5], '10000101')
    test_vb_encode([127], '11111111')
    test_vb_encode([128], '00000001' + '10000000')
    test_vb_encode([129], '00000001' + '10000001')

    import sys, random
    for _ in range(1000):
        n = random.randint(0, sys.maxsize)
        assert vbcode.decode(vbcode.encode([n]))[0] == n
Example #10
0
    dist = math.sqrt(len(values))
    jump_table = values[::dist]
    for i in reversed(xrange(1, len(jump_table))):
        jump_table[i] -= jump_table[i - 1]

    for v in values:
        entries = list(revertIndex[key][v])
        for i in reversed(xrange(1, len(entries))):
            entries[i] = entries[i] - entries[i - 1]
        list_for_compression.append(len(entries))
        list_for_compression.extend(entries)

    list_for_compression.append(len(jump_table))
    list_for_compression.extend(jump_table)

    compressed = vbcode.encode(list_for_compression)

    bytearr += struct.pack('I{}s0I'.format(len(compressed)), len(compressed),
                           compressed)

    revIndexFile.write(bytearr)

allDocIds = sorted(forwardIndex.keys())

# Serialize the forward index, one record per document, in ascending docId
# order.  Record layout (struct):
#   [uint docId][uint title len][title bytes][uint url len][url bytes]
# with trailing '0I' padding the record to 4-byte alignment.
for docId in allDocIds:
    title = forwardIndex[docId].title.encode('utf-8')
    url = forwardIndex[docId].url.encode('utf-8')
    bytearr = struct.pack('II{}sI{}s0I'.format(len(title), len(url)), docId,
                          len(title), title, len(url), url)
    forwardIndexFile.write(bytearr)
Example #11
0
def create_raw_invert_index(direct_index):
    raw_invert_index = defaultdict(lambda: defaultdict(lambda: [int, []]))
    """
    raw_invert_index:
    {
        word: 
            {
                doc_id: (кол-во токенов в докумене, [позиции, на которых слово встретиловсь])
                # частотсу слова в документе, для расчета idf можно получить как len([позиции])
            }
    }
    """
    dir_name = DIR_WITH_TOKENS

    step = 0
    t = datetime.datetime.now()

    for doc_id, offset_for_title in direct_index.items():
        token = read_direct_index(offset_for_title)
        with open(f"../{dir_name}/{token}.txt", 'r') as f:
            tokens_list = json.load(f)

            for i in range(len(tokens_list)):
                word_hash = hash_str(tokens_list[i])
                raw_invert_index[word_hash][doc_id][0] = len(tokens_list)
                raw_invert_index[word_hash][doc_id][1].append(i)

            sys.stdout.write(
                f"\rОбработано в сырой обратный индекс: {step} / {len(direct_index)}"
            )
            sys.stdout.flush()
            step += 1

    print(
        f"\nСырой обратный индекс создан. {datetime.datetime.now() - t}.\nНачинаем обработку индекса"
    )

    # Этот каст делается для того, что бы raw_invert_index стал обычным словарем
    raw_invert_index = dict(raw_invert_index)
    len_raw_invert_index = len(raw_invert_index)
    step = 0
    t = datetime.datetime.now()

    offset_in_bin_file = 0

    total_files_in_corp = len(direct_index)

    for key_dict, value_dict in raw_invert_index.items():

        IDF = math.log10(total_files_in_corp / len(value_dict))

        offset_for_word = offset_in_bin_file

        # Преобразование doc_id
        sorted_doc_ids, vbcode_doc_ids = get_vb_code_for_doc_ids(
            value_dict.keys())
        # длина кода vb
        len_of_vbcode_doc_ids = len(vbcode_doc_ids)

        # Преобразование [pos_in_file, ...], подсчет частот(freq), а так же подсчет метрики TFIDF
        freqs = []
        tfidf = []
        for_write_pos_in_file = b''
        for elem in sorted_doc_ids:
            total_tokens_in_file, pos_in_file = value_dict[elem]

            current_freq = len(pos_in_file)
            freqs.append(current_freq)

            TF = current_freq / total_tokens_in_file
            TFIDF = TF * IDF
            tfidf.append(TFIDF)

            for_write_pos_in_file += vbcode.encode(sorted(pos_in_file))
        # длина позиций в документе в vb
        len_of_vbcode_for_pos_in_files = len(for_write_pos_in_file)

        # Форматируем частоты в файле
        frm = str(len(freqs)) + FORMAT_TO_UI
        write_freq = struct.pack(frm, *freqs)

        # Формируем TFIDF в файле
        frm = str(len(tfidf)) + FORMAT_TO_FL
        write_tfidf = struct.pack(frm, *tfidf)

        res = vbcode_doc_ids + write_tfidf + write_freq + for_write_pos_in_file
        total_len = len(res)
        with open('bin_file', 'ab') as f:
            f.write(res)

        raw_invert_index[key_dict] = (offset_for_word, len_of_vbcode_doc_ids,
                                      len_of_vbcode_for_pos_in_files)
        offset_in_bin_file += total_len

        sys.stdout.write(
            f"\rСоздание обратного индекс: {step} / {len_raw_invert_index}")
        sys.stdout.flush()
        step += 1

    print(f"\nОбратный индекс создан. {datetime.datetime.now() - t}")
    return raw_invert_index
Example #12
0
            struct.unpack('<{}I'.format(coordsLen), r.read(4 * coordsLen)))
        entries[docId] = coords

    return (True, word, entries)


fileRevert = sys.argv[1]

# Re-encode an existing revert-index file with variable-byte compression:
# postings are read from <fileRevert> and written to <fileRevert>_compress.
r = open(fileRevert, 'rb')
w = open('{}_compress'.format(fileRevert), 'wb')

readyRevertIndex = dict()

# readPosting returns (flag, word, entries); flag goes False at EOF.
flag = True
while flag == True:
    flag, word, entries = readPosting(r)
    if flag == True:
        keyLength = len(word)
        values = sorted(list(entries.keys()))
        # Record header: [uint word len][word bytes][uint number of doc ids].
        bytearr = struct.pack('I{}sI'.format(keyLength), keyLength, word,
                              len(values))
        for v in values:
            ent = entries[v]
            # Prepend the doc id so it is vb-encoded together with its coords.
            ent.insert(0, v)

            bytestream = vbcode.encode(ent)
            # [uint compressed len][compressed bytes], 4-byte aligned via '0I'.
            compressed = struct.pack('I{}s0I'.format(len(bytestream)),
                                     len(bytestream), bytestream)
            bytearr += compressed

        w.write(bytearr)
Example #13
0
def compress_using_variable_byte(integers: list):
    """Variable-byte encode `integers` and return the result as a hex string."""
    encoded = vbcode.encode(integers)
    return encoded.hex()
Example #14
0
def variable_encoding(postings):
    """Gap-encode `postings`, vb-compress the gaps, and return the encoded
    bytes interpreted as one big-endian integer."""
    gaps = get_gaps_list(postings)
    compressed = vbcode.encode(gaps)
    return int.from_bytes(compressed, byteorder='big')
Example #15
0
 def test_vb_decode():
     """Round-trip a random integer through vbcode encode/decode."""
     value = random.randint(0, sys.maxsize)
     decoded = vbcode.decode(vbcode.encode([value]))
     assert decoded[0] == value
     print("test ok. %s -> %s" % (value, value))
Example #16
0
 def test_vb_decode():
     """Check that vbcode.decode inverts vbcode.encode on a random int."""
     sample = random.randint(0, sys.maxsize)
     round_tripped = vbcode.decode(vbcode.encode([sample]))[0]
     assert round_tripped == sample
     print("test ok. %s -> %s" % (sample, sample))
Example #17
0
 def test_vb_encode(numbers, ok):
     # Compare the encoder's output, rendered as a bit string, against `ok`.
     stream = vbcode.encode(numbers)
     as_bits = ''.join(
         format(byte, '08b') for byte in unpack('%dB' % len(stream), stream))
     assert as_bits == ok
     print("test ok. %s -> %s" % (numbers, ok))
Example #18
0

# In[4]:


def decode(bytestream):
    """Decode a variable-byte encoded stream into a list of integers.

    Each number is stored base-128, most significant group first; the final
    byte of a number has its high bit set (value >= 128), all earlier
    (continuation) bytes have it clear.

    Args:
        bytestream: bytes/bytearray produced by the matching vb encoder.

    Returns:
        list[int]: the decoded numbers (empty for an empty stream).
    """
    numbers = []
    n = 0
    # Iterating bytes in Python 3 yields ints directly, so the original
    # struct.unpack('%dB' ...) round-trip is unnecessary.
    for byte in bytestream:
        if byte < 128:
            # continuation byte: shift in 7 more bits
            n = 128 * n + byte
        else:
            # terminator byte: strip the high bit and finish this number
            n = 128 * n + (byte - 128)
            numbers.append(n)
            n = 0
    return numbers


# In[10]:


# Notebook-style smoke test: vb-encode two sample integers.
vbcode.encode([777,6789])


# In[6]:


# And decode a two-byte stream: 0x06 continues, 0x89 terminates -> [777].
vbcode.decode(b'\x06\x89')