# NOTE(review): the next three statements read like the tail of a per-document
# corpus loop — they use `tokens`, `i`, and `paragraphs` from an enclosing
# scope that is not visible in this chunk. Confirm the intended indentation
# against the full file before merging.
vocabulary.update(tokens)  # set.update replaces the manual per-term add loop
document = Document(i, paragraphs)
documents.append(document)

# Length of vocabulary (kept for debugging; presumably used by later code —
# TODO confirm, otherwise it can be dropped).
vocabularyLength = len(vocabulary)

# Creating the inverted index over the whole corpus.
indexer = Indexer(documents)

# Take filename as input for processing. Note: raises IndexError when no
# command-line argument is supplied — confirm whether a usage message is
# wanted instead. argv entries are already str, so the old str() was redundant.
inputDocument = sys.argv[1]

# Read the whole input document; errors="ignore" silently drops undecodable
# bytes, matching the corpus-loading behavior.
with open(inputDocument, encoding="utf8", errors="ignore") as input_file:
    raw = input_file.read()

# Split the input into paragraphs and preprocess each one into a Paragraph.
paras = paragraph_tokenizer(raw)
paragraphs = []
for j, para in enumerate(paras):
    tokens = preprocessor(para)
    # Document id -1 so that it is different from other documents in the corpus.
    _id = (-1, j)
    paragraphs.append(Paragraph(_id, tokens))

input_doc = Document(-1, paragraphs)

# Score the input against the corpus and report the 10 best matches.
top_k, uniqueness = indexer.evaluate_input(input_doc, files, 10)
for score, filename in top_k:  # direct unpacking instead of range(len(...))
    print(f'Document: {filename}', f'Score: {score}')
print(f'Uniqueness: {uniqueness} %')