# Project-local preprocessing, retrieval and evaluation modules
# (module file names below are assumed from the class names used in this script)
from sentenceSegmentation import SentenceSegmentation
from tokenization import Tokenization
from inflectionReduction import InflectionReduction
from stopwordRemoval import StopwordRemoval
from informationRetrieval import InformationRetrieval
from evaluation import Evaluation

import json
import matplotlib.pyplot as plt


class SearchEngine:

    def __init__(self, args):
        self.args = args
        self.tokenizer = Tokenization()
        self.sentenceSegmenter = SentenceSegmentation()
        self.inflectionReducer = InflectionReduction()
        self.stopwordRemover = StopwordRemoval()
        self.informationRetriever = InformationRetrieval()
        self.evaluator = Evaluation()

    def segmentSentences(self, text):
        """
        Call the required sentence segmenter
        """
        if self.args.segmenter == "naive":
            return self.sentenceSegmenter.naive(text)
        elif self.args.segmenter == "punkt":
            return self.sentenceSegmenter.punkt(text)

    def tokenize(self, text):
        """
        Call the required tokenizer
        """
        if self.args.tokenizer == "naive":
            return self.tokenizer.naive(text)
        elif self.args.tokenizer == "ptb":
            return self.tokenizer.pennTreeBank(text)

    def reduceInflection(self, text):
        """
        Call the required stemmer/lemmatizer
        """
        return self.inflectionReducer.reduce(text)

    def removeStopwords(self, text):
        """
        Call the required stopword remover
        """
        return self.stopwordRemover.fromList(text)

    def preprocessQueries(self, queries):
        """
        Preprocess the queries - segment, tokenize, stem/lemmatize and remove stopwords
        """
        # Segment queries
        segmentedQueries = []
        for query in queries:
            segmentedQuery = self.segmentSentences(query)
            segmentedQueries.append(segmentedQuery)
        json.dump(segmentedQueries, open(self.args.out_folder + "segmented_queries.txt", 'w'))
        # Tokenize queries
        tokenizedQueries = []
        for query in segmentedQueries:
            tokenizedQuery = self.tokenize(query)
            tokenizedQueries.append(tokenizedQuery)
        json.dump(tokenizedQueries, open(self.args.out_folder + "tokenized_queries.txt", 'w'))
        # Stem/Lemmatize queries
        reducedQueries = []
        for query in tokenizedQueries:
            reducedQuery = self.reduceInflection(query)
            reducedQueries.append(reducedQuery)
        json.dump(reducedQueries, open(self.args.out_folder + "reduced_queries.txt", 'w'))
        # Remove stopwords from queries
        stopwordRemovedQueries = []
        for query in reducedQueries:
            stopwordRemovedQuery = self.removeStopwords(query)
            stopwordRemovedQueries.append(stopwordRemovedQuery)
        json.dump(stopwordRemovedQueries,
                  open(self.args.out_folder + "stopword_removed_queries.txt", 'w'))

        preprocessedQueries = stopwordRemovedQueries
        return preprocessedQueries

    def preprocessDocs(self, docs):
        """
        Preprocess the documents
        """
        # Segment docs
        segmentedDocs = []
        for doc in docs:
            segmentedDoc = self.segmentSentences(doc)
            segmentedDocs.append(segmentedDoc)
        json.dump(segmentedDocs, open(self.args.out_folder + "segmented_docs.txt", 'w'))
        # Tokenize docs
        tokenizedDocs = []
        for doc in segmentedDocs:
            tokenizedDoc = self.tokenize(doc)
            tokenizedDocs.append(tokenizedDoc)
        json.dump(tokenizedDocs, open(self.args.out_folder + "tokenized_docs.txt", 'w'))
        # Stem/Lemmatize docs
        reducedDocs = []
        for doc in tokenizedDocs:
            reducedDoc = self.reduceInflection(doc)
            reducedDocs.append(reducedDoc)
        json.dump(reducedDocs, open(self.args.out_folder + "reduced_docs.txt", 'w'))
        # Remove stopwords from docs
        stopwordRemovedDocs = []
        for doc in reducedDocs:
            stopwordRemovedDoc = self.removeStopwords(doc)
            stopwordRemovedDocs.append(stopwordRemovedDoc)
        json.dump(stopwordRemovedDocs,
                  open(self.args.out_folder + "stopword_removed_docs.txt", 'w'))

        preprocessedDocs = stopwordRemovedDocs
        return preprocessedDocs

    def evaluateDataset(self):
        """
        - preprocesses the queries and documents, stores in output folder
        - invokes the IR system
        - evaluates precision, recall, fscore, nDCG and MAP
          for all queries in the Cranfield dataset
        - produces graphs of the evaluation metrics in the output folder
        """
        # Read queries
        queries_json = json.load(open(self.args.dataset + "cran_queries.json", 'r'))[:]
        query_ids, queries = [item["query number"] for item in queries_json], \
                             [item["query"] for item in queries_json]
        # Process queries
        processedQueries = self.preprocessQueries(queries)

        # Read documents
        docs_json = json.load(open(self.args.dataset + "cran_docs.json", 'r'))[:]
        doc_ids, docs = [item["id"] for item in docs_json], \
                        [item["body"] for item in docs_json]
        # Process documents
        processedDocs = self.preprocessDocs(docs)

        # Build document index
        self.informationRetriever.buildIndex(processedDocs, doc_ids)
        # Rank the documents for each query
        doc_IDs_ordered = self.informationRetriever.rank(processedQueries)

        # Read relevance judgements
        qrels = json.load(open(self.args.dataset + "cran_qrels.json", 'r'))[:]

        # Calculate precision, recall, f-score, MAP and nDCG for k = 1 to 10
        precisions, recalls, fscores, MAPs, nDCGs = [], [], [], [], []
        for k in range(1, 11):
            precision = self.evaluator.meanPrecision(doc_IDs_ordered, query_ids, qrels, k)
            precisions.append(precision)
            recall = self.evaluator.meanRecall(doc_IDs_ordered, query_ids, qrels, k)
            recalls.append(recall)
            fscore = self.evaluator.meanFscore(doc_IDs_ordered, query_ids, qrels, k)
            fscores.append(fscore)
            print("Precision, Recall and F-score @ " + str(k) + " : " +
                  str(precision) + ", " + str(recall) + ", " + str(fscore))
            MAP = self.evaluator.meanAveragePrecision(doc_IDs_ordered, query_ids, qrels, k)
            MAPs.append(MAP)
            nDCG = self.evaluator.meanNDCG(doc_IDs_ordered, query_ids, qrels, k)
            nDCGs.append(nDCG)
            print("MAP, nDCG @ " + str(k) + " : " + str(MAP) + ", " + str(nDCG))

        # Plot the metrics and save the plot
        plt.plot(range(1, 11), precisions, label="Precision")
        plt.plot(range(1, 11), recalls, label="Recall")
        plt.plot(range(1, 11), fscores, label="F-Score")
        plt.plot(range(1, 11), MAPs, label="MAP")
        plt.plot(range(1, 11), nDCGs, label="nDCG")
        plt.legend()
        plt.title("Evaluation Metrics - Cranfield Dataset")
        plt.xlabel("k")
        plt.savefig(self.args.out_folder + "eval_plot.png")

    def handleCustomQuery(self):
        """
        Take a custom query as input and return top five relevant documents
        """
        # Get query
        print("Enter query below")
        query = input()
        # Process query
        processedQuery = self.preprocessQueries([query])[0]

        # Read documents
        docs_json = json.load(open(self.args.dataset + "cran_docs.json", 'r'))[:]
        doc_ids, docs = [item["id"] for item in docs_json], \
                        [item["body"] for item in docs_json]
        # Process documents
        processedDocs = self.preprocessDocs(docs)

        # Build document index
        self.informationRetriever.buildIndex(processedDocs, doc_ids)
        # Rank the documents for the query
        doc_IDs_ordered = self.informationRetriever.rank([processedQuery])[0]

        # Print the IDs of the first five documents
        print("\nTop five document IDs : ")
        for id_ in doc_IDs_ordered[:5]:
            print(id_)
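The class above reads self.args.dataset, self.args.out_folder, self.args.segmenter and self.args.tokenizer, but the listing never shows how that args namespace is built. A minimal command-line driver sketch is given below; the flag names, defaults, and the -custom switch between handleCustomQuery() and evaluateDataset() are assumptions for illustration, not part of the original listing.

# Hypothetical entry point for the SearchEngine class above.
# Flag names and defaults are assumed; adjust to the project's actual CLI.
import argparse

if __name__ == "__main__":
    parser = argparse.ArgumentParser(description="Cranfield search engine")
    parser.add_argument("-dataset", default="cranfield/",
                        help="Path to the dataset folder")
    parser.add_argument("-out_folder", default="output/",
                        help="Path to the output folder")
    parser.add_argument("-segmenter", default="punkt",
                        help="Sentence segmenter type [naive|punkt]")
    parser.add_argument("-tokenizer", default="ptb",
                        help="Tokenizer type [naive|ptb]")
    parser.add_argument("-custom", action="store_true",
                        help="Run an interactive custom query instead of the full evaluation")
    args = parser.parse_args()

    searchEngine = SearchEngine(args)
    if args.custom:
        searchEngine.handleCustomQuery()
    else:
        searchEngine.evaluateDataset()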
# ---------------------------------------------------------------------------
# Separate driver script: parallel tokenization and vectorization of a CSV
# corpus. The imports below are inferred from the names used; Vectorization
# and processElement are project-local and assumed to be defined/importable.
# ---------------------------------------------------------------------------
import sys
import time
from multiprocessing import cpu_count
from concurrent.futures import ProcessPoolExecutor

import pandas as pd

if __name__ == '__main__':
    # Command-line arguments: input CSV path and topic-model hyperparameters
    # (the hyperparameters are read here but not used in this excerpt)
    fin = sys.argv[1]
    num_topics = int(sys.argv[2])
    num_words = int(sys.argv[3])
    num_iterations = int(sys.argv[4])
    # Leave two cores free for the OS / main process
    no_threads = cpu_count() - 2

    print("Start Read File!")
    df = pd.read_csv(fin)
    print("End Read File!")

    print("Start Tokenization!")
    start = time.time() * 1000
    tkn = Tokenization()  # only needed by the (commented-out) UDF path below

    # Alternative single-process path with a pandas UDF:
    # df = df.apply(tkns, axis=1)
    # clean_texts = df.to_list()

    # Parallel path: map processElement over the rows of the DataFrame.
    # processElement is expected to be defined elsewhere in this script
    # (a hedged sketch follows below).
    clean_texts = []
    with ProcessPoolExecutor(max_workers=no_threads) as worker:
        for result in worker.map(processElement, df.to_numpy()):
            if result:
                clean_texts.append(result)
    end = time.time() * 1000
    print("Execution time (ms)", end - start)
    print("End Tokenization!")

    print("Start Vectorization!")
    vec = Vectorization(clean_texts)
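The parallel loop above maps processElement over the DataFrame rows, but the helper itself is not included in the excerpt. A minimal sketch follows, assuming each row arrives as a NumPy array with the text field at a known column index and that Tokenization exposes the same pennTreeBank interface used by SearchEngine; in the actual script the helper would have to be defined before the __main__ block. The column index and method choice are illustrative assumptions, not taken from the original code.

# Hypothetical worker for ProcessPoolExecutor.map; not part of the original script.
from tokenization import Tokenization  # assumed project-local module, as above

TEXT_COLUMN = 1  # assumed position of the text field within each row array


def processElement(row):
    """Tokenize the text field of one DataFrame row; return None to skip it."""
    text = row[TEXT_COLUMN]
    if not isinstance(text, str) or not text.strip():
        return None  # filtered out by the `if result:` check in the caller
    tokenizer = Tokenization()
    # pennTreeBank expects a list of sentences, as in SearchEngine.tokenize above
    return tokenizer.pennTreeBank([text])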