def __init__(self, docs, gram):
    """Store the documents and build the index.

    Args:
        docs: Mapping from document id to a document object that exposes
            a ``words`` list.
        gram: Gram size. When it equals 2, the given documents are
            replaced by one synthetic document per distinct word; that
            document's ``words`` are the character bigrams of the word
            wrapped in ``#`` boundary markers.

    Side effects:
        Sets ``self.index``, ``self.docs`` and ``self.gram``, then calls
        ``self.create_index()``.
    """
    self.index = {}
    self.docs = docs
    self.gram = gram

    if self.gram == 2:
        # Collect every distinct word across all documents.
        vocabulary = unique(
            [word for doc in docs.values() for word in doc.words]
        )

        bigram_docs = {}
        for doc_id, word in enumerate(vocabulary):
            # "#" marks the word boundaries so that prefixes and suffixes
            # get their own bigrams, e.g. "cat" -> "#c", "ca", "at", "t#".
            padded = "#" + word + "#"
            bigrams = [padded[k:k + 2] for k in range(len(padded) - 1)]

            bigram_doc = Doc(doc_id, ' '.join(bigrams))
            bigram_doc.words = bigrams
            bigram_docs[doc_id] = bigram_doc

        # From here on the index is built over the per-word bigram docs.
        self.docs = bigram_docs

    # The index is built for every gram size, not only for bigrams.
    self.create_index()
# Interactive search driver (fragment: the `if` that opens the language branch
# is above this view, so only the tail of that branch and its `else` are visible).
# Visible flow:
#   1. Choose EnglishPreprocessor or PersianPreprocessor (the Persian branch
#      loads '../data/Persian.xml' via read_docs).
#   2. Set doc.words = preprocessor.preprocess(doc.text) for every document.
#   3. Build a PositionalIndexer with gram=1 and keep its `.index`.
#   4. Read a query and a tag from stdin; the query is wrapped in Doc(0, query)
#      and preprocessed the same way as the documents.
#   5. If the tag is one of "1".."4", call classify(docs) before searching.
#   6. Call search(...) and print result[1], then result[0].text, for each result.
# NOTE(review): the 10 passed to search() is presumably the number of results
# to return -- confirm against search()'s signature.
# NOTE(review): `tag` is parsed to int/None but is only used for the classify
# check; search() receives the raw `query_tag` string -- confirm which one
# search() expects.
preprocessor = EnglishPreprocessor(docs) else: docs = read_docs('../data/Persian.xml') preprocessor = PersianPreprocessor(docs) for doc in docs.values(): doc.words = preprocessor.preprocess(doc.text) print("Preprocess is done!") index = PositionalIndexer(docs, 1).index print("Index Created Successfully!") query = input("Enter Query: ") q_doc = Doc(0, query) q_doc.words = preprocessor.preprocess(q_doc.text) query_tag = input("Enter Tag (1, 2, 3, 4, None): ") tag = None if query_tag in ["1", "2", "3", "4"]: tag = int(query_tag) if tag is not None: classify(docs) results = search(q_doc, docs, index, 10, query_tag) for result in results: print(result[1]) print(result[0].text) print()