def openList(termId, getCache=False):
    if getCache:
        if is_cached(termId):
            data = get_cache_data(termId)
            # Reset the traversal cursors before handing back the cached list.
            data["current_chunk_index"] = 0
            data["current_posting_index"] = 0
            return data
    lexicon_node_obj = lexicon_list[termId]
    # Open the inverted list file for reading.
    list_file = open(pwd + "inverted_index_new/" + str(lexicon_node_obj.did), "rb")
    # Seek to the start offset of this term's inverted list.
    list_file.seek(int(lexicon_node_obj.start))
    list_data_str = list_file.read(int(lexicon_node_obj.length))
    list_posting = {
        "current_chunk_index": 0,
        "current_posting_index": 0,
        "current_freq": 0,
        "meta_data": [],
        # The inverted list file name.
        "did": lexicon_node_obj.did
    }
    # print "lexicon_node_obj.start:" + str(lexicon_node_obj.start)
    # print "lexicon_node_obj.len:" + str(lexicon_node_obj.length)
    # Decode the metadata block: alternating (last docid delta, chunk size) pairs.
    list_data = decode7bit(list_data_str[:int(lexicon_node_obj.meta_length)])
    list_file.close()
    # print "len(list_data):---" + str(len(list_data))
    for i in range(0, len(list_data), 2):
        if i != 0:
            # Undo the delta encoding of each chunk's last document id.
            list_data[i] += list_data[i - 2]
        list_posting["meta_data"].append({
            # The last document id of this chunk.
            "did": list_data[i],
            "chunk_size": list_data[i + 1]
        })
    if True:  # if getCache:
        # Keep the raw chunk bytes in memory; chunks are decoded lazily in nextGEQ.
        list_posting["chunks_str"] = list_data_str[int(lexicon_node_obj.meta_length):]
    else:
        # Eager path (currently disabled): decode every chunk up front.
        size = 0
        chunks_str = list_data_str[int(lexicon_node_obj.meta_length):]
        for i in range(0, len(list_data), 2):
            chunk_content = decode7bit(chunks_str[size:size + list_data[i + 1]])
            chunk_postings = []
            for j in range(0, len(chunk_content), 2):
                if j != 0:
                    # Undo the delta encoding within the chunk.
                    chunk_content[j] += chunk_content[j - 2]
                elif i != 0:
                    # The first docid of a chunk is a delta from the previous
                    # chunk's last docid.
                    chunk_content[j] += list_data[i - 2]
                chunk_postings.append({
                    "did": chunk_content[j],
                    "freq": chunk_content[j + 1]
                })
            list_posting[i // 2] = chunk_postings
            size += list_data[i + 1]
    return list_posting
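
# decode7bit is defined elsewhere in this project and is not shown here.  The
# sketch below is only an assumption about its behavior: a standard varbyte
# decoder with 7 payload bits per byte, most-significant groups first, and the
# high bit set on the terminating byte of each integer.  The real encoder may
# use a different bit convention, but the callers above only rely on it
# turning a byte string into a flat list of non-negative integers.
def decode7bit(data):
    numbers = []
    n = 0
    for byte in bytearray(data):
        if byte < 128:
            # Continuation byte: accumulate 7 more payload bits.
            n = (n << 7) | byte
        else:
            # Terminating byte (high bit set): finish the current integer.
            n = (n << 7) | (byte & 0x7F)
            numbers.append(n)
            n = 0
    return numbers
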
def nextGEQ(list_posting, k_docID):
    current_chunk_index = int(list_posting["current_chunk_index"])
    meta_data = list_posting["meta_data"]
    current_posting_index = int(list_posting["current_posting_index"])
    meta_data_length = len(meta_data)
    # Skip chunks whose last document id is still smaller than k_docID.
    while current_chunk_index < meta_data_length:
        # The last document id of the current chunk.
        did = meta_data[current_chunk_index]["did"]
        if did >= k_docID:
            break
        current_chunk_index += 1
        current_posting_index = 0
    if current_chunk_index >= meta_data_length:
        return max_doc_id
    if current_chunk_index in list_posting:
        # This chunk has already been decoded; scan its postings directly.
        for j in range(current_posting_index, len(list_posting[current_chunk_index])):
            next_did = list_posting[current_chunk_index][j]["did"]
            if next_did >= k_docID:
                list_posting["current_posting_index"] = j
                list_posting["current_freq"] = list_posting[current_chunk_index][j]["freq"]
                return next_did
    else:
        size = 0
        for meta_index in range(current_chunk_index):
            # Calculate the byte offset of the chunk.
            size += int(list_posting["meta_data"][meta_index]["chunk_size"])
        # Decode the chunk content (slice runs from the chunk's offset to
        # offset + chunk_size).
        chunk_content = decode7bit(
            list_posting["chunks_str"]
            [size:size + meta_data[current_chunk_index]["chunk_size"]])
        chunk_postings = []
        next_did = -1
        for i in range(0, len(chunk_content), 2):
            if i != 0:
                # Undo the delta encoding of the document id.
                chunk_content[i] += chunk_content[i - 2]
            elif current_chunk_index != 0:
                # The first docid is a delta from the previous chunk's last docid.
                chunk_content[i] += meta_data[current_chunk_index - 1]["did"]
            chunk_postings.append({
                "did": chunk_content[i],
                "freq": chunk_content[i + 1]
            })
            if chunk_content[i] >= k_docID and next_did == -1:
                list_posting["current_posting_index"] = i // 2
                list_posting["current_freq"] = chunk_content[i + 1]
                next_did = chunk_content[i]
        # Cache the decoded chunk for later calls.
        list_posting[current_chunk_index] = chunk_postings
        if next_did != -1:
            list_posting["current_chunk_index"] = current_chunk_index
            return next_did
    return max_doc_id
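
# A hedged usage sketch, not part of the original file: document-at-a-time
# conjunctive (AND) traversal built on openList/nextGEQ.  `term_ids` is a
# hypothetical list of term ids, and `max_doc_id` is assumed (as in nextGEQ
# above) to be a sentinel larger than any real document id.
def intersect(term_ids):
    lists = [openList(t, getCache=True) for t in term_ids]
    results = []
    did = 0
    while did < max_doc_id:
        # Advance the first list to the next candidate document.
        did = nextGEQ(lists[0], did)
        if did >= max_doc_id:
            break
        # Check that every other list also contains this candidate.
        d = did
        for lp in lists[1:]:
            d = nextGEQ(lp, did)
            if d != did:
                break
        if d == did:
            results.append(did)
            did += 1
        else:
            # Restart the search from the largest document id seen so far.
            did = d
    return results
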