    # NOTE(review): this chunk begins mid-function — the `def` header of
    # TransformationIntoPostings and the initialisation of `dictionary_`
    # are above this view.  Indent depth assumed to be one level — TODO confirm.
    # Walk the (token, doc_id) pairs — assumed sorted by token, since a new
    # group starts whenever the key changes — collecting doc ids per token
    # and skipping duplicate ids within a group.
    previous_key = ''
    for key, value in sorted_token_pairs:
        if previous_key != key:
            # First occurrence of this token: open a fresh postings list.
            previous_key = key
            dictionary_[previous_key] = [value]
        else:
            # Same token as the previous pair: record the doc id only once.
            if value not in dictionary_[previous_key]:
                dictionary_[previous_key].append(value)
    # Second pass: replace each postings list with a
    # (document-frequency, postings-list) tuple.
    for key in dictionary_:
        value = dictionary_[key]
        dictionary_[key] = (len(value), value)
    return dictionary_


if __name__ == "__main__":
    # Standalone Test: run the first two files through the whole pipeline
    # (read -> tokenize -> linguistic normalisation -> sort -> postings).
    fileList = ListFiles(rootDir)
    tokenPair = []
    for i in range(2):
        tokenPair = tokenPair + GenerateTokens(GetFileContents(fileList[i]), fileList[i])
    original_list = LingModule(tokenPair)
    sorted_list = SortTokens(original_list)
    # print(sorted_list)
    transformed_postings = TransformationIntoPostings(sorted_list)
    print(transformed_postings)
# Sorting component of the indexing pipeline.
# Input: list of pairs < token , document id > concatenated from all documents.
# Output: the same pairs, sorted by token and then by document id.
from CONST import *
from Directory_Listing import ListFiles
from File_Reading import GetFileContents
from Tokenization import GenerateTokens
from Linguistic_Modules import LingModule


def SortTokens(token_pairs_list):
    """Return the pairs ordered primarily by token, secondarily by doc id."""
    def pair_key(pair):
        return (pair[0], pair[1])

    return sorted(token_pairs_list, key=pair_key)


if __name__ == "__main__":
    # Standalone Test: tokenize the first two files, normalise, then sort.
    fileList = ListFiles(rootDir)
    tokenPair = []
    for path in fileList[:2]:
        tokenPair += GenerateTokens(GetFileContents(path), path)
    original_list = LingModule(tokenPair)
    print(original_list)
    sorted_list = SortTokens(original_list)
    print(sorted_list)
# Driver: builds a dictionary-tree index over every file under rootDir,
# then (past the end of this chunk) serves interactive queries.
if __name__ == "__main__":
    pid = os.getpid()
    pyname = os.path.basename(__file__)
    start = time.time()
    # Directory Listing
    all_files = ListFiles(rootDir)
    #m_s = GetMemory(pid,pyname)
    dic_tree = Dic_Tree()
    for file in all_files:
        # File Reading
        file_text = GetFileContents(file)
        # Split on whitespace, normalise each token with LingStr, and insert
        # the whole batch into the tree tagged with this file's path.
        tokens = file_text.split()
        dic_tree.insert_many([LingStr(item) for item in tokens], file)
        # Drop per-file intermediates promptly to limit peak memory.
        del tokens
        del file_text
    #m_e = GetMemory(pid,pyname)
    # Elapsed wall-clock time for index construction, in milliseconds.
    time_index = (time.time() - start) * 1000
    print("Time for creating index:\t", time_index, "ms")
    #print("Memory for the index:\t", m_e - m_s, "KB")
    # Interactive query loop — the body continues beyond this chunk
    # (NOTE(review): remainder of the loop is outside this view).
    while True:
        query = input()
        q_start = time.time()
        queries = query.split()
# Shared stemmer instances, built once at import time so they are not
# recreated per call.
portStemmer = PorterStemmer()
snowStemmer = SnowballStemmer("english")

# Punctuation/symbol stripper.  Compiled once at module level instead of
# being re-looked-up through re.sub on every token (LingStr runs in the
# per-token hot loop of the indexer).
_PUNCT_RE = re.compile(r'[\^\[\]\-\\!@#$%&*()_=+`~":;|/.,?{}<>\']')


def LingStr(token):
    """Normalise one token: lowercase, strip punctuation, then stem.

    The token is run through the Porter stemmer and then the Snowball
    stemmer, in that order (both applied, as in the original pipeline).
    Returns "" when the token consisted only of stripped characters.
    """
    tmp = _PUNCT_RE.sub('', str(token).lower())
    tmp = portStemmer.stem(str(tmp))
    tmp = snowStemmer.stem(str(tmp))
    return tmp


def LingModule(tokenPair):
    """Apply LingStr to every pair, dropping tokens that normalise to "".

    Input:  list of pairs < token , document id >
    Output: list of pairs < normalised token , document id >
    """
    pairList = []
    for token, docID in tokenPair:
        stemmed = LingStr(token)
        if stemmed != "":
            pairList.append((stemmed, docID))
    return pairList


if __name__ == "__main__":
    # Standalone test: normalise the token stream of the first file found.
    fileList = ListFiles(rootDir)
    fileContent = GetFileContents(fileList[0])
    tokenPair = GenerateTokens(fileContent, fileList[0])
    outputPair = LingModule(tokenPair)
    print(outputPair)
# Tokenization
# Input: text (file contents), string (document id = path to file)
# Output: list of pairs < string (token) , string (document id) >
import itertools
from CONST import *
from File_Reading import GetFileContents
import re


def GenerateTokens(contents, filePath):
    """Split the text on whitespace and tag every token with its file path."""
    return [(token, filePath) for token in contents.split()]


if __name__ == '__main__':
    # Standalone Test
    filePath = rootDir + "1.txt"
    contents = GetFileContents(filePath)
    tokens = GenerateTokens(contents, filePath)
    print(contents)
    print("tokens")
    print(tokens)
#from Get_Memory_Req import GetMemory if __name__ == "__main__": pid = os.getpid() pyname = os.path.basename(__file__) start = time.time() # Directory Listing all_files = ListFiles(rootDir) all_token_pairs = [] for file in all_files: # File Reading file_text = GetFileContents(file) # Tokenization token_pairs = GenerateTokens(file_text, file) # Linguistic Modules modified_token_pairs = LingModule(token_pairs) all_token_pairs += modified_token_pairs del file_text del token_pairs del modified_token_pairs # Sorting the Tokens sorted_tokens = SortTokens(all_token_pairs) del all_token_pairs # Transformation into Postings #m_s = GetMemory(pid,pyname)