Beispiel #1
0
def main():
    """Re-weight every per-partition index from term frequency to tf-idf.

    For each partition name ('a' through 'z', plus "num") this loads the
    pickled index from ``<name>.txt`` into an ``InvertedIndex``, replaces
    every posting value ``tf`` with ``round(tf * log10(N / df), 4)`` where
    ``df`` is the number of entries in that token's posting dict, and
    pickles the resulting dictionary to ``tf_score_<name>.txt``.

    Raises:
        FileNotFoundError: if a partition file ``<name>.txt`` does not exist.
    """
    partitions = [chr(code) for code in range(ord('a'), ord('z') + 1)]
    partitions.append("num")
    # Numerator of the idf ratio; presumably the total number of documents
    # in the indexed corpus -- TODO confirm against the indexing run.
    N = 55393

    for name in partitions:
        indexer = InvertedIndex()
        # NOTE: pickle.load can execute arbitrary code; only point this at
        # partition files produced by the trusted indexing step.
        with open(name + ".txt", 'rb') as f:
            indexer.merge(pickle.load(f))

        # Calculating tf-idf score for each separate index
        index = indexer.getDict()
        for token, postings in index.items():
            df = len(postings)
            if not df:
                # Nothing to score, and N / df would divide by zero.
                continue
            idf = math.log(N / df, 10)
            # Rebinding an existing key does not resize the dict, so this is
            # safe while iterating over index.items().
            index[token] = {
                doc_id: round(tf * idf, 4) for doc_id, tf in postings.items()
            }

        # Store the results into tf_score_<name>.txt
        with open("tf_score_" + name + ".txt", 'wb') as f:
            pickle.dump(index, f)
Beispiel #2
0
def main():
    """Merge all partial indexes and split the result into per-letter files.

    Loads the pickled partial indexes ``indexer0.txt`` .. ``indexer87.txt``
    and merges them into a single ``InvertedIndex``. Every key of the merged
    dictionary is then placed in one of 27 buckets according to its first
    character: 'a'..'z' each get their own bucket, and every other key
    (digits, other characters, or an empty key) goes to the "num" bucket.
    Each bucket is pickled to ``<letter>.txt`` or ``num.txt``.

    Raises:
        FileNotFoundError: if one of the ``indexer<i>.txt`` files is missing.
    """
    merged = InvertedIndex()

    # Load all indexes into one InvertedIndex() (class object for index)
    # NOTE: pickle.load can execute arbitrary code; only point this at
    # index files produced by the trusted indexing step.
    for part in range(88):
        with open("indexer" + str(part) + ".txt", 'rb') as f:
            merged.merge(pickle.load(f))

    # One bucket per lowercase ASCII letter plus "num": 26 + 1 = 27.
    # Insertion order (a..z, then num) is also the order files are written.
    bucket_names = [chr(code) for code in range(ord('a'), ord('z') + 1)]
    bucket_names.append("num")
    buckets = {name: {} for name in bucket_names}

    for token, postings in merged.getDict().items():
        # Slicing instead of token[0] so an empty key lands in "num" rather
        # than raising IndexError.
        first = token[:1]
        name = first if 'a' <= first <= 'z' else "num"
        buckets[name][token] = postings

    # Create a partial file for each bucket and dump its dict into that file
    for name, bucket in buckets.items():
        with open(f"{name}.txt", "wb") as f:
            pickle.dump(bucket, f)