Example #1
def openList(termId, getCache=False):
    if getCache:
        if is_cached(termId):
            data = get_cache_data(termId)
            # Reset the index information.
            data["current_chunk_index"] = 0
            data["current_posting_index"] = 0
            return data
    lexicon_node_obj = lexicon_list[termId]
    # Open to read the inverted list file.
    list_file = open(pwd + "inverted_index_new/" + str(lexicon_node_obj.did), "rb")
    # Seek to the start offset of the inverted list information for this term.
    list_file.seek(int(lexicon_node_obj.start))
    list_data_str = list_file.read(int(lexicon_node_obj.length))
    list_posting = {
        "current_chunk_index": 0,
        "current_posting_index": 0,
        "current_freq": 0,
        "meta_data": [],
        # The inverted list file name.
        "did": lexicon_node_obj.did
    }
#    print "lexicon_node_obj.start:" + str(lexicon_node_obj.start)
#    print "lexicon_node_obj.len:" + str(lexicon_node_obj.length)
    # Decode the meta data information.
    list_data = decode7bit(list_data_str[:int(lexicon_node_obj.meta_length)])
    list_file.close()
    # print "len(list_data):---" + str(len(list_data))
    for i in range(0, len(list_data), 2):
        if i != 0:
            # Decode the document id information.
            list_data[i] += list_data[i - 2]
        list_posting["meta_data"].append({
            # The last document id of this chunk.
            "did": list_data[i],
            "chunk_size": list_data[i + 1]
        })

    if True:  # Originally "if getCache:"; the lazy path is always taken now.
        # Keep the still-encoded chunk data in memory; individual chunks are
        # decoded lazily in nextGEQ.
        list_posting["chunks_str"] = list_data_str[int(lexicon_node_obj.meta_length):]
    else:
        size = 0
        chunks_str = list_data_str[int(lexicon_node_obj.meta_length):]
        for i in range(0, len(list_data), 2):
            chunk_content = decode7bit(chunks_str[size:size + list_data[i + 1]])
            chunk_postings = []
            for j in range(0, len(chunk_content), 2):
                if j != 0:
                    chunk_content[j] += chunk_content[j - 2]
                elif i != 0:
                    chunk_content[j] += list_data[i - 2]
                chunk_postings.append({
                    "did": chunk_content[j],
                    "freq": chunk_content[j + 1]
                })
            list_posting[i // 2] = chunk_postings  # Integer division: chunks are keyed by chunk index.
            size += list_data[i + 1]
    return list_posting
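
Both examples depend on a decode7bit helper that is not shown. Below is a minimal sketch of a compatible decoder, assuming a 7-bits-per-byte variable-length encoding in which the high bit marks the final byte of each value (one common varbyte convention; the real index may use a different one):

def decode7bit(data):
    # Decode a varbyte-encoded byte string into a list of integers.
    # Assumed convention: each value is stored 7 bits per byte, low-order
    # group first, and the high bit is set on the last byte of a value.
    values = []
    current = 0
    shift = 0
    for byte in bytearray(data):
        current |= (byte & 0x7F) << shift
        shift += 7
        if byte & 0x80:
            values.append(current)
            current = 0
            shift = 0
    return values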
Example #2
def nextGEQ(list_posting, k_docID):
    current_chunk_index = int(list_posting["current_chunk_index"])
    meta_data = list_posting["meta_data"]
    current_posting_index = int(list_posting["current_posting_index"])
    meta_data_length = len(meta_data)
    while current_chunk_index < meta_data_length:
        # The last document id of current chunk.
        did = meta_data[current_chunk_index]["did"]
        if did >= k_docID:
            break
        current_chunk_index += 1
        current_posting_index = 0
    if current_chunk_index >= meta_data_length:
        return max_doc_id
    if current_chunk_index in list_posting:
        # The chunk has already been decoded; scan its cached postings.
        for j in range(current_posting_index,
                       len(list_posting[current_chunk_index])):
            next_did = list_posting[current_chunk_index][j]["did"]
            if next_did >= k_docID:
                list_posting["current_chunk_index"] = current_chunk_index
                list_posting["current_posting_index"] = j
                list_posting["current_freq"] = list_posting[
                    current_chunk_index][j]["freq"]
                return next_did
    else:
        size = 0
        for meta_index in range(current_chunk_index):
            # Sum the sizes of the preceding chunks to get this chunk's offset.
            size += int(meta_data[meta_index]["chunk_size"])
        # Decode the chunk content (the slice runs from the offset to
        # offset + chunk_size).
        chunk_content = decode7bit(
            list_posting["chunks_str"]
            [size:size + meta_data[current_chunk_index]["chunk_size"]])
        chunk_postings = []
        next_did = -1
        for i in range(0, len(chunk_content), 2):
            if i != 0:
                # Decode the document id.
                chunk_content[i] += chunk_content[i - 2]
            elif current_chunk_index != 0:
                chunk_content[i] += meta_data[current_chunk_index - 1]["did"]
            chunk_postings.append({
                "did": chunk_content[i],
                "freq": chunk_content[i + 1]
            })
            if chunk_content[i] >= k_docID and next_did == -1:
                list_posting["current_posting_index"] = i / 2
                list_posting["current_freq"] = chunk_content[i + 1]
                next_did = chunk_content[i]
        list_posting[current_chunk_index] = chunk_postings
        if next_did != -1:
            list_posting["current_chunk_index"] = current_chunk_index
            return next_did
    return max_doc_id
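
As a usage note, openList and nextGEQ can be combined for document-at-a-time query processing. Below is a minimal sketch of a conjunctive (AND) traversal over two posting lists, assuming max_doc_id is the sentinel returned when a list is exhausted and that term_a and term_b are hypothetical term ids from the lexicon:

def intersect(term_a, term_b):
    # Return the document ids that contain both terms (sketch only).
    list_a = openList(term_a)
    list_b = openList(term_b)
    results = []
    did = nextGEQ(list_a, 0)
    while did < max_doc_id:
        did_b = nextGEQ(list_b, did)
        if did_b >= max_doc_id:
            break
        if did_b == did:
            results.append(did)
            did = nextGEQ(list_a, did + 1)
        else:
            did = nextGEQ(list_a, did_b)
    return results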