Example #1
import sys

if __name__ == '__main__':
    # Corpus construction. NOTE: the original example begins mid-loop;
    # this loop head is reconstructed from the input-document code
    # below. `files` (the list of corpus file paths), `paragraph_tokenizer`,
    # `preprocessor`, `Paragraph`, `Document`, and `Indexer` are defined
    # elsewhere in the source project.
    documents = []
    vocabulary = set()
    for i, filename in enumerate(files):
        with open(filename, encoding="utf8", errors="ignore") as corpus_file:
            raw = corpus_file.read()
            paras = paragraph_tokenizer(raw)
            paragraphs = []
            for j, para in enumerate(paras):
                tokens = preprocessor(para)
                paragraphs.append(Paragraph((i, j), tokens))
                # Collect every distinct term into the vocabulary
                for term in tokens:
                    vocabulary.add(term)
            document = Document(i, paragraphs)
            documents.append(document)

    # Number of distinct terms in the corpus vocabulary
    vocabularyLength = len(vocabulary)
    # print(vocabularyLength)
    
    # Build the inverted index over the corpus documents
    indexer = Indexer(documents)

    # The document to check is passed as the first command-line argument
    inputDocument = sys.argv[1]
    # inputDocument = 'test.txt'
    with open(inputDocument, encoding="utf8", errors="ignore") as input_file:
        raw = input_file.read()
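    # Tokenize the input exactly as the corpus documents were tokenized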
    paras = paragraph_tokenizer(raw)
    paragraphs = []
    for j, para in enumerate(paras):
        tokens = preprocessor(para)
        _id = (-1, j)  # doc id -1 keeps the input distinct from every corpus document
        paragraph = Paragraph(_id, tokens)
        paragraphs.append(paragraph)
    input_doc = Document(-1, paragraphs)
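    # Rank the corpus against the input: top_k holds up to 10
    # (score, filename) pairs, uniqueness an overall percentage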
    top_k, uniqueness = indexer.evaluate_input(input_doc, files, 10)
    for score, filename in top_k:
        print(f'Document: {filename} Score: {score}')
    print(f'Uniqueness: {uniqueness} %')
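
The helpers used above (`paragraph_tokenizer`, `preprocessor`, `Paragraph`, `Document`, `Indexer`) are defined elsewhere in the source project. Below is a minimal sketch of plausible stand-ins, purely so the example can be run end to end: the names match the example, but the bodies are assumptions, not the project's actual implementation. In particular, the overlap scoring inside `evaluate_input` is a stand-in for whatever similarity measure the real `Indexer` uses.

# --- Hypothetical stand-ins (assumptions, not the project's real code) ---
import re
from collections import defaultdict

def paragraph_tokenizer(text):
    # Assumption: paragraphs are separated by blank lines.
    return [p for p in re.split(r'\n\s*\n', text) if p.strip()]

def preprocessor(paragraph):
    # Assumption: lowercase alphanumeric word tokens.
    return re.findall(r'[a-z0-9]+', paragraph.lower())

class Paragraph:
    def __init__(self, _id, tokens):
        self.id = _id
        self.tokens = tokens

class Document:
    def __init__(self, _id, paragraphs):
        self.id = _id
        self.paragraphs = paragraphs

class Indexer:
    # Toy inverted index: term -> set of corpus document ids.
    def __init__(self, documents):
        self.index = defaultdict(set)
        for doc in documents:
            for para in doc.paragraphs:
                for term in para.tokens:
                    self.index[term].add(doc.id)

    def evaluate_input(self, input_doc, files, k):
        # Crude overlap score: the fraction of the input's distinct
        # terms that each corpus document shares with it.
        input_terms = {t for p in input_doc.paragraphs for t in p.tokens}
        if not input_terms:
            return [], 100.0
        counts = defaultdict(int)
        for term in input_terms:
            for doc_id in self.index[term]:
                counts[doc_id] += 1
        ranked = sorted(((n / len(input_terms), files[doc_id])
                         for doc_id, n in counts.items()), reverse=True)
        top_k = ranked[:k]
        # Uniqueness: how much of the input is NOT covered by its best match.
        uniqueness = round((1 - top_k[0][0]) * 100, 2) if top_k else 100.0
        return top_k, uniqueness

With stand-ins like these in place, the example runs as a plain script, e.g. `python example.py test.txt` (test.txt being the sample name left commented out above), printing the ten closest corpus documents and a uniqueness percentage. A `files` list of corpus paths still has to be populated before the main block, as the example assumes.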