import math
import queue
from heapq import heappop

# NOTE: pf, dictionary, postings_file, postings_base_pointer, the read_* helpers
# and the *_byte_width constants are module-level definitions from elsewhere in
# this project.


def get_list_for_word(word, pos, details):
    """
    Gets the list of documents for a single word at its wanted positions
    :param word: the word to search
    :param pos: the positions wanted
    :param details: True if detailed positions also need to be returned
    :return: the list of documents that contain this word in the wanted positions,
             and the corresponding list of positions if needed
    """
    res = []
    res_pos = []
    if len(pos) == 1:
        # Single wanted position: every document in the posting list qualifies
        pf.seek(postings_base_pointer + dictionary[word]['ptr'])
        for _ in range(dictionary[word]['df']):
            res.append(read_doc_id(pf))
            tf = read_tf(pf)
            pp = read_position_pointer(pf)
            if details:
                this_pos = []
                return_ptr = pf.tell()  # remember where the next posting starts
                pf.seek(pp)
                for _ in range(tf):
                    this_pos.append(read_positional_index(pf))
                res_pos.append(this_pos)
                pf.seek(return_ptr)  # restore the posting cursor for the next iteration
        return res, res_pos

    # Multiple wanted positions: slide a window of len(pos) positions over each
    # document's positional list and test whether its shape matches pos
    pf.seek(postings_base_pointer + dictionary[word]['ptr'])
    for _ in range(dictionary[word]['df']):
        doc_id = read_doc_id(pf)
        tf = read_tf(pf)
        pos_ptr = read_position_pointer(pf)
        if tf < len(pos):
            continue
        pos_reader = open(postings_file, 'rb')
        pos_reader.seek(pos_ptr)
        current_positions = queue.Queue()
        t_id = len(pos)  # term id of next to inspect
        for _ in range(len(pos)):
            current_positions.put(read_positional_index(pos_reader))
        is_valid = False
        this_pos = []
        pos_list = list(current_positions.queue)
        if is_isomorphic(pos_list, pos):
            is_valid = True
            this_pos.append(pos_list[0])
        while t_id < tf:
            if is_valid and not details:
                break
            current_positions.get()
            current_positions.put(read_positional_index(pos_reader))
            t_id += 1
            pos_list = list(current_positions.queue)
            if is_isomorphic(pos_list, pos):
                is_valid = True
                this_pos.append(pos_list[0])
        if is_valid:
            res.append(doc_id)
            res_pos.append(this_pos)
        pos_reader.close()
    return res, res_pos
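# is_isomorphic is defined elsewhere in this project. Based solely on how it is
# called above, it is assumed to check that a window of positions has the same
# shape (identical consecutive gaps) as the wanted position list; the stand-in
# below is a sketch of that assumed contract, not the project's implementation.
def _is_isomorphic_sketch(pos_list, pos):
    """Hypothetical stand-in: True if pos_list and pos have identical gaps."""
    if len(pos_list) != len(pos):
        return False
    return all(b - a == d - c
               for a, b, c, d in zip(pos_list, pos_list[1:], pos, pos[1:]))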
def read_position(word, doc_index, index):
    """
    Reads a positional index
    :param word: the word to search for
    :param doc_index: the document index of the word to search for, not the docID
    :param index: the i-th appearance of the term
    :return: the positional index
    """
    _, _, ptr = read_posting(word, doc_index)
    pf.seek(ptr + pos_byte_width * index)
    return read_positional_index(pf)
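# The read_doc_id / read_tf / read_position_pointer / read_positional_index
# helpers are defined elsewhere. Given the seek arithmetic above (offsets are
# multiples of *_byte_width), they are assumed to read fixed-width integers
# from the current file offset; a sketch under that assumption (the byte order
# is also an assumption):
def _read_int_sketch(f, width):
    """Hypothetical stand-in for the fixed-width integer readers."""
    return int.from_bytes(f.read(width), byteorder='big')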
def intersect_word_list(word, docs, pos_lst, pos, details):
    """
    Gets the simple intersection of a word and an intermediate result list
    :param word: the word to search for
    :param docs: the intermediate list of documents
    :param pos_lst: the intermediate list of position lists
    :param pos: the relative positions of the word and the intermediate list
    :param details: True if the positions of each phrase match need to be returned
    :return: the list of documents that contain the word and the intermediate list
             in the wanted relative positions, and the corresponding list of lists
             of positions if needed
    """
    res = []
    res_pos = []
    if not docs:
        return [], []
    # [word, intermediate list] for the following
    doc_reader = open(postings_file, 'rb')
    base_pointer = postings_base_pointer + dictionary[word]['ptr']
    doc_reader.seek(base_pointer)
    doc_id = [read_doc_id(doc_reader), docs[0]]
    term_freq = [read_tf(doc_reader), len(pos_lst[0])]
    pos_pointer = read_position_pointer(doc_reader)
    doc_count = [1, 1]  # count = next index to inspect
    doc_freq = [dictionary[word]['df'], len(docs)]
    docs.append(-1)     # sentinel so the final advance does not overflow
    pos_lst.append([])  # matching sentinel for the position lists
    skip_width = [math.floor(math.sqrt(doc_freq[0])), math.floor(math.sqrt(doc_freq[1]))]
    while doc_count[0] <= doc_freq[0] and doc_count[1] <= doc_freq[1]:
        found = doc_id[0] == doc_id[1]
        if doc_id[0] < doc_id[1]:
            # Take square-root-sized skips through the word's postings while
            # they do not overshoot, then scan linearly
            if skip_width[0] > 1:
                while doc_count[0] + skip_width[0] < doc_freq[0]:
                    d_id, _, _ = read_posting(word, doc_count[0] + skip_width[0])
                    if d_id > doc_id[1]:
                        break
                    doc_count[0] += skip_width[0]
                doc_reader.seek(base_pointer + doc_byte_width * (doc_count[0] - 1))
            for _ in range(skip_width[0] + 1):
                d_id = read_doc_id(doc_reader)
                tf = read_tf(doc_reader)
                pp = read_position_pointer(doc_reader)
                doc_count[0] += 1
                if d_id >= doc_id[1]:
                    found = d_id == doc_id[1]
                    doc_id[0] = d_id
                    term_freq[0] = tf
                    pos_pointer = pp
                    break
            # Bug fix: do not abandon a match that lands on the last posting
            if doc_count[0] >= doc_freq[0] and not found:
                break
        elif doc_id[0] > doc_id[1]:
            # Same skip-then-scan advance over the in-memory intermediate list
            if skip_width[1] > 1:
                while doc_count[1] + skip_width[1] < doc_freq[1]:
                    d_id = docs[doc_count[1] + skip_width[1]]
                    if d_id > doc_id[0]:
                        break
                    doc_count[1] += skip_width[1]
            for _ in range(skip_width[1] + 1):
                d_id = docs[doc_count[1]]
                tf = len(pos_lst[doc_count[1]])
                doc_count[1] += 1
                if d_id >= doc_id[0]:
                    found = d_id == doc_id[0]
                    doc_id[1] = d_id
                    term_freq[1] = tf
                    break
            if doc_count[1] >= doc_freq[1] and not found:
                break
        if found:
            # Same document: merge the two position lists with skip pointers
            pos_reader = open(postings_file, 'rb')
            pos_reader.seek(pos_pointer)
            position_list = pos_lst[doc_count[1] - 1]
            position_list.append(-1)  # sentinel for the final advance
            position = [read_positional_index(pos_reader), position_list[0]]
            skip_pos_width = [math.floor(math.sqrt(term_freq[0])),
                              math.floor(math.sqrt(term_freq[1]))]
            pos_count = [1, 1]
            found2 = (position[1] - position[0] == pos[1] - pos[0])
            this_pos = []
            if found2:
                this_pos.append(position[0])
            while pos_count[0] <= term_freq[0] and pos_count[1] <= term_freq[1] \
                    and (details or not found2):
                if position[1] - position[0] > pos[1] - pos[0]:
                    # The word's position is too far behind: advance it
                    if skip_pos_width[0] > 1:
                        while pos_count[0] + skip_pos_width[0] < term_freq[0]:
                            p_id = read_position(word, doc_count[0],
                                                 pos_count[0] + skip_pos_width[0])
                            if position[1] - p_id < pos[1] - pos[0]:
                                break
                            pos_count[0] += skip_pos_width[0]
                        pos_reader.seek(pos_pointer + pos_byte_width * (pos_count[0] - 1))
                    for _ in range(skip_pos_width[0] + 1):
                        p_id = read_positional_index(pos_reader)
                        pos_count[0] += 1
                        if position[1] - p_id <= pos[1] - pos[0]:
                            matches = position[1] - p_id == pos[1] - pos[0]
                            found2 = matches or found2
                            position[0] = p_id
                            if matches:
                                this_pos.append(position[0])
                            break
                    if pos_count[0] >= term_freq[0]:
                        break
                elif position[1] - position[0] < pos[1] - pos[0]:
                    # The intermediate list's position is too far behind: advance it
                    if skip_pos_width[1] > 1:
                        while pos_count[1] + skip_pos_width[1] < term_freq[1]:
                            p_id = position_list[pos_count[1] + skip_pos_width[1]]
                            if position[0] - p_id < pos[0] - pos[1]:
                                break
                            pos_count[1] += skip_pos_width[1]
                    for _ in range(skip_pos_width[1] + 1):
                        p_id = position_list[pos_count[1]]
                        pos_count[1] += 1
                        if position[0] - p_id <= pos[0] - pos[1]:
                            matches = position[0] - p_id == pos[0] - pos[1]
                            found2 = matches or found2
                            position[1] = p_id
                            if matches:
                                this_pos.append(position[0])
                            break
                    if pos_count[1] >= term_freq[1]:
                        break
            if found2:
                res.append(doc_id[0])
                res_pos.append(this_pos)
            pos_reader.close()
            # Advance both lists past the matched document
            doc_id[0] = read_doc_id(doc_reader)
            term_freq[0] = read_tf(doc_reader)
            pos_pointer = read_position_pointer(doc_reader)
            doc_count[0] += 1
            doc_id[1] = docs[doc_count[1]]
            term_freq[1] = len(pos_lst[doc_count[1]])
            doc_count[1] += 1
    doc_reader.close()
    return res, res_pos
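# How these functions are assumed to compose for a phrase query such as
# "a b c": intersect the first two terms, then fold each remaining term into
# the intermediate (docs, pos_lst) result. The term ordering and the position
# offsets passed below are assumptions for illustration, not confirmed by this
# file.
def _phrase_query_sketch(terms, details=False):
    # Positions are measured relative to the first term of the phrase
    docs, pos_lst = intersect_words(terms[0], terms[1], [0, 1], True)
    for i in range(2, len(terms)):
        docs, pos_lst = intersect_word_list(terms[i], docs, pos_lst, [i, 0], details)
    return docs, pos_lst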
def intersect_words(w1, w2, pos, details):
    """
    Gets the simple intersection of the two given words
    :param w1: the first word to search for
    :param w2: the second word to search for
    :param pos: the relative positions of w1 and w2
    :param details: True if the positions of each phrase match need to be returned
    :return: the list of documents that contain the two words in the wanted relative
             positions, and the corresponding list of lists of positions if needed
    """
    res = []
    res_pos = []
    # [w1, w2] for the following
    w = [w1, w2]
    doc_reader = [open(postings_file, 'rb'), open(postings_file, 'rb')]
    base_pointer = [postings_base_pointer + dictionary[w1]['ptr'],
                    postings_base_pointer + dictionary[w2]['ptr']]
    doc_reader[0].seek(base_pointer[0])
    doc_reader[1].seek(base_pointer[1])
    doc_id = [read_doc_id(doc_reader[0]), read_doc_id(doc_reader[1])]
    term_freq = [read_tf(doc_reader[0]), read_tf(doc_reader[1])]
    pos_pointer = [read_position_pointer(doc_reader[0]), read_position_pointer(doc_reader[1])]
    doc_count = [1, 1]  # count = next index to inspect
    doc_freq = [dictionary[w1]['df'], dictionary[w2]['df']]
    skip_width = [math.floor(math.sqrt(doc_freq[0])), math.floor(math.sqrt(doc_freq[1]))]
    while doc_count[0] <= doc_freq[0] and doc_count[1] <= doc_freq[1]:
        found = doc_id[0] == doc_id[1]
        if doc_id[0] != doc_id[1]:
            # Advance whichever list is behind, with skips then a linear scan
            smaller_index = 0
            if doc_id[0] > doc_id[1]:
                smaller_index = 1
            other_index = 1 - smaller_index
            if skip_width[smaller_index] > 1:
                while doc_count[smaller_index] + skip_width[smaller_index] < doc_freq[smaller_index]:
                    d_id, _, _ = read_posting(w[smaller_index],
                                              doc_count[smaller_index] + skip_width[smaller_index])
                    if d_id > doc_id[other_index]:
                        break
                    doc_count[smaller_index] += skip_width[smaller_index]
                doc_reader[smaller_index].seek(base_pointer[smaller_index]
                                               + doc_byte_width * (doc_count[smaller_index] - 1))
            for _ in range(skip_width[smaller_index] + 1):
                d_id = read_doc_id(doc_reader[smaller_index])
                tf = read_tf(doc_reader[smaller_index])
                pp = read_position_pointer(doc_reader[smaller_index])
                doc_count[smaller_index] += 1
                if d_id >= doc_id[other_index]:
                    found = d_id == doc_id[other_index]
                    doc_id[smaller_index] = d_id
                    term_freq[smaller_index] = tf
                    pos_pointer[smaller_index] = pp
                    break
            # Bug fix: do not abandon a match that lands on the last posting
            if doc_count[smaller_index] >= doc_freq[smaller_index] and not found:
                break
        if found:
            # Same document: merge the two position lists with skip pointers
            diff = [pos[1] - pos[0], pos[0] - pos[1]]  # diff = other - this
            pos_reader = [open(postings_file, 'rb'), open(postings_file, 'rb')]
            pos_reader[0].seek(pos_pointer[0])
            pos_reader[1].seek(pos_pointer[1])
            position = [read_positional_index(pos_reader[0]), read_positional_index(pos_reader[1])]
            skip_pos_width = [math.floor(math.sqrt(term_freq[0])),
                              math.floor(math.sqrt(term_freq[1]))]
            pos_count = [1, 1]
            found2 = (position[1] - position[0] == diff[0])
            this_pos = []
            if found2:
                this_pos.append(position[0])
            while pos_count[0] <= term_freq[0] and pos_count[1] <= term_freq[1] \
                    and (details or not found2):
                smaller_index = 0
                if position[1] - position[0] < diff[0]:
                    smaller_index = 1
                other_index = 1 - smaller_index
                if skip_pos_width[smaller_index] > 1:
                    # Peek ahead through the shared module-level handle pf
                    while pos_count[smaller_index] + skip_pos_width[smaller_index] < term_freq[smaller_index]:
                        pf.seek(pos_pointer[smaller_index]
                                + (pos_count[smaller_index] + skip_pos_width[smaller_index])
                                * pos_byte_width)
                        p_id = read_positional_index(pf)
                        if position[other_index] - p_id < diff[smaller_index]:
                            break
                        pos_count[smaller_index] += skip_pos_width[smaller_index]
                    pos_reader[smaller_index].seek(pos_pointer[smaller_index]
                                                   + pos_byte_width * (pos_count[smaller_index] - 1))
                for _ in range(skip_pos_width[smaller_index] + 1):
                    p_id = read_positional_index(pos_reader[smaller_index])
                    pos_count[smaller_index] += 1
                    if position[other_index] - p_id <= diff[smaller_index]:
                        matches = position[other_index] - p_id == diff[smaller_index]
                        found2 = matches or found2
                        position[smaller_index] = p_id
                        if matches:
                            this_pos.append(position[0])
                        break
                if pos_count[smaller_index] >= term_freq[smaller_index]:
                    break
            if found2:
                res.append(doc_id[0])
                res_pos.append(this_pos)
            pos_reader[0].close()
            pos_reader[1].close()
            # Bug fix: advance both lists past the matched document
            # (the original range(1) only advanced the first reader)
            for i in range(2):
                doc_id[i] = read_doc_id(doc_reader[i])
                term_freq[i] = read_tf(doc_reader[i])
                pos_pointer[i] = read_position_pointer(doc_reader[i])
                doc_count[i] += 1
    doc_reader[0].close()
    doc_reader[1].close()
    return res, res_pos
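# Both merges above use implicit skip pointers: for a list of length n, probes
# spaced floor(sqrt(n)) apart give the classic O(sqrt(n)) skip behaviour, and
# because postings and positions are fixed-width records, following a skip is
# just a seek. A self-contained illustration of the same idea on an in-memory
# sorted list (illustrative only, not project code):
def _skip_to(lst, start, target):
    """Advance start past elements < target with sqrt-spaced skips, then linearly."""
    skip = math.floor(math.sqrt(len(lst)))
    i = start
    while skip > 1 and i + skip < len(lst) and lst[i + skip] <= target:
        i += skip
    while i < len(lst) and lst[i] < target:
        i += 1
    return i

# Example: _skip_to([2, 4, 8, 16, 32, 64], 0, 20) returns 4, the index of 32,
# after one skip of width 2 and two linear steps.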
def merge_blocks(block_count, out_dict, out_postings):
    """
    Merges the blocks to write the overall postings file
    :param block_count: the number of blocks
    :param out_dict: the output dictionary file
    :param out_postings: the output postings file
    """
    # Load dictionaries and open files
    index_list = list(range(block_count))
    dictionaries = []
    posting_files = []
    positions_files = []
    lengths_files = []
    for i in index_list:
        dict_file = open(f'dictionary{i}.txt', 'r')
        dict_file.readline()
        dictionary = {}
        load_dict(dict_file, dictionary)
        dict_file.close()
        dictionaries.append(dictionary)
        posting_files.append(open(f'postings{i}.txt', 'rb'))
        positions_files.append(open(f'positions{i}.txt', 'rb'))
        lengths_files.append(open(f'lengths{i}.txt', 'rb'))

    # Prepare to write files
    post_writer = open(out_postings, 'wb')

    # Print positions: a k-way merge over the block dictionaries, driven by a
    # heap of each block's current leading term
    leading_terms = []
    dictionary_iters = []
    pos_pointers = {}
    pointer = 0
    setup_iters(dictionaries, dictionary_iters, index_list, leading_terms)
    while leading_terms:
        leading_term = heappop(leading_terms)
        update_leading_term(dictionary_iters, leading_term, leading_terms)
        word = leading_term[0]
        collections = [leading_term]
        while leading_terms and leading_terms[0][0] == word:
            term_block_info = heappop(leading_terms)
            collections.append(term_block_info)
            update_leading_term(dictionary_iters, term_block_info, leading_terms)
        pos_pointers[word] = {}
        for block in collections:
            block_index = block[1]
            df = dictionaries[block_index][word]['df']
            ptr = dictionaries[block_index][word]['ptr']
            pf = posting_files[block_index]
            posf = positions_files[block_index]
            pf.seek(ptr)
            for _ in range(df):
                doc = read_doc_id(pf)
                tf = read_tf(pf)
                pp = read_position_pointer(pf)
                posf.seek(pp)
                for _ in range(tf):
                    pos = read_positional_index(posf)
                    write_int_bin_file(post_writer, pos, pos_byte_width)
                pos_pointers[word][doc] = pointer  # start offset of this doc's positions
                pointer += tf * pos_byte_width

    # Print postings, rewriting each block-local position pointer to its merged offset
    dictionary_iters = []
    post_pointers = {}
    doc_freq = {}
    postings_base_pointer = pointer
    pointer = 0
    setup_iters(dictionaries, dictionary_iters, index_list, leading_terms)
    while leading_terms:
        leading_term = heappop(leading_terms)
        update_leading_term(dictionary_iters, leading_term, leading_terms)
        word = leading_term[0]
        post_pointers[word] = pointer
        doc_freq[word] = 0
        collections = [leading_term]
        while leading_terms and leading_terms[0][0] == word:
            term_block_info = heappop(leading_terms)
            collections.append(term_block_info)
            update_leading_term(dictionary_iters, term_block_info, leading_terms)
        for block in collections:
            block_index = block[1]
            df = dictionaries[block_index][word]['df']
            ptr = dictionaries[block_index][word]['ptr']
            pf = posting_files[block_index]
            pf.seek(ptr)
            doc_freq[word] += df
            for _ in range(df):
                doc = read_doc_id(pf)
                tf = read_tf(pf)
                pf.read(pos_pointer_byte_width)  # skip the block-local position pointer
                write_int_bin_file(post_writer, doc, post_byte_width)
                write_int_bin_file(post_writer, tf, tf_byte_width)
                write_int_bin_file(post_writer, pos_pointers[word][doc], pos_pointer_byte_width)
                pointer += doc_byte_width

    # Print dictionary
    lengths_base_pointer = pointer + postings_base_pointer
    dictionary_iters = []
    setup_iters(dictionaries, dictionary_iters, index_list, leading_terms)
    dict_writer = open(out_dict, 'a')
    dict_writer.write(f'{postings_base_pointer} {lengths_base_pointer}\n')
    while leading_terms:
        leading_term = heappop(leading_terms)
        update_leading_term(dictionary_iters, leading_term, leading_terms)
        word = leading_term[0]
        while leading_terms and leading_terms[0][0] == word:
            term_block_info = heappop(leading_terms)
            update_leading_term(dictionary_iters, term_block_info, leading_terms)
        dict_writer.write(f'{word} {doc_freq[word]} {post_pointers[word]}\n')
    dict_writer.close()

    # Print lengths
    for i in index_list:
        lf = lengths_files[i]
        for _ in range(block_size):
            doc = read_doc_id(lf)
            length = read_float_bin_file(lf)
            if not length:  # end of this block's lengths file
                break
            write_int_bin_file(post_writer, doc, post_byte_width)
            write_float_bin_file(post_writer, length)

    # Close files
    post_writer.close()
    for f in posting_files:
        f.close()
    for f in positions_files:
        f.close()
    for f in lengths_files:
        f.close()
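# write_int_bin_file, write_float_bin_file and read_float_bin_file are defined
# elsewhere in the project. Minimal sketches consistent with the fixed-width
# reads assumed earlier (the byte order and the 8-byte double format are
# assumptions):
import struct

def _write_int_sketch(f, value, width):
    f.write(value.to_bytes(width, byteorder='big'))

def _write_float_sketch(f, value):
    f.write(struct.pack('>d', value))  # 8-byte IEEE-754 double

def _read_float_sketch(f):
    data = f.read(8)
    return struct.unpack('>d', data)[0] if len(data) == 8 else None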