def main(total_docs=55393):
    """Convert each per-letter partial index from raw counts to tf-idf scores.

    For every bucket name ("a" .. "z" plus "num") this reads the pickled
    index stored in "<bucket>.txt", merges it into a fresh InvertedIndex,
    replaces each per-document value ``tf`` with
    ``round(tf * log10(total_docs / df), 4)`` -- where ``df`` is the number
    of documents listed under the token -- and pickles the resulting
    dictionary to "tf_score_<bucket>.txt".

    Args:
        total_docs: Corpus size used as the numerator of the idf term.
            Defaults to 55393, the value this script was written for.

    Raises:
        OSError: If an input file cannot be opened or an output file cannot
            be written.
    """
    # One partial index per lowercase letter, plus "num" for everything else.
    bucket_names = [chr(code) for code in range(ord('a'), ord('z') + 1)]
    bucket_names.append("num")

    for bucket in bucket_names:
        # NOTE(review): pickle can execute arbitrary code while loading; only
        # run this against index files produced by this project's own pipeline.
        with open(bucket + ".txt", 'rb') as infile:
            partial = pickle.load(infile)

        indexer = InvertedIndex()
        indexer.merge(partial)

        # Assumes getDict() returns the indexer's live token -> {docID: tf}
        # mapping (the original code relied on the same thing) -- TODO confirm.
        index = indexer.getDict()

        # Calculating tf-idf score for each separate index.
        for token, postings in index.items():
            df = len(postings)
            # idf is constant for a token, so compute it once; the guard keeps
            # an empty posting dict mapping to {} instead of dividing by zero.
            idf = math.log(total_docs / df, 10) if df else 0.0
            # Re-binding an existing key does not resize the dict, so this is
            # safe while iterating over index.items().
            index[token] = {
                doc_id: round(tf * idf, 4) for doc_id, tf in postings.items()
            }

        # Store the results into tf_score_<bucket>.txt
        with open("tf_score_" + bucket + ".txt", 'wb') as outfile:
            pickle.dump(index, outfile)
def main(num_partials=88):
    """Merge the partial indexes and re-split them by first character.

    Reads "indexer0.txt" .. "indexer<num_partials - 1>.txt", merging each
    unpickled object into a single InvertedIndex. The merged dictionary is
    then divided into 27 buckets: keys whose first character is a lowercase
    English letter go to that letter's bucket, every other key (including an
    empty-string key) goes to the "num" bucket. Each bucket is pickled to
    "<letter>.txt" or "num.txt".

    Args:
        num_partials: Number of "indexer<N>.txt" files to merge. Defaults to
            88, the count this script was written for.

    Raises:
        OSError: If a partial index cannot be opened or a bucket file cannot
            be written.
    """
    merged = InvertedIndex()

    # Load all indexes into one InvertedIndex() (class object for index).
    for part in range(num_partials):
        # NOTE(review): pickle can execute arbitrary code while loading; only
        # run this against index files produced by this project's own pipeline.
        with open("indexer" + str(part) + ".txt", 'rb') as infile:
            merged.merge(pickle.load(infile))

    # Length of alphabet + one catch-all bucket = 26 + 1 = 27.
    catch_all = 26
    buckets = [dict() for _ in range(catch_all + 1)]

    for token, postings in merged.getDict().items():
        # Slicing (rather than indexing) tolerates an empty key, which then
        # falls through to the catch-all bucket instead of raising IndexError.
        first = token[:1]
        if 'a' <= first <= 'z':
            # The first character selects which partial dictionary the key
            # belongs to.
            slot = ord(first) - ord('a')
        else:
            # Not part of the English alphabet -> insert in number index.
            slot = catch_all
        buckets[slot][token] = postings

    # Create one partial file per letter, with the last bucket named "num".
    names = [chr(ord('a') + offset) for offset in range(catch_all)]
    names.append("num")
    for name, bucket in zip(names, buckets):
        with open(f"{name}.txt", "wb") as outfile:
            pickle.dump(bucket, outfile)