# Local project modules; module names are assumed to mirror the class names,
# following the sentenceSegmentation module used elsewhere in this repo.
from sentenceSegmentation import SentenceSegmentation
from tokenization import Tokenization
from inflectionReduction import InflectionReduction
from stopwordRemoval import StopwordRemoval

import json


class SearchEngine:

    def __init__(self, args):
        self.args = args
        self.tokenizer = Tokenization()
        self.sentenceSegmenter = SentenceSegmentation()
        self.inflectionReducer = InflectionReduction()
        self.stopwordRemover = StopwordRemoval()

    def segmentSentences(self, text):
        """
        Return the required sentence segmenter
        """
        if self.args.segmenter == "naive":
            return self.sentenceSegmenter.naive(text)
        elif self.args.segmenter == "punkt":
            return self.sentenceSegmenter.punkt(text)

    def tokenize(self, text):
        """
        Return the required tokenizer
        """
        if self.args.tokenizer == "naive":
            return self.tokenizer.naive(text)
        elif self.args.tokenizer == "ptb":
            return self.tokenizer.pennTreeBank(text)

    def reduceInflection(self, text):
        """
        Return the required stemmer/lemmatizer
        """
        return self.inflectionReducer.reduce(text)

    def removeStopwords(self, text):
        """
        Return the required stopword remover
        """
        return self.stopwordRemover.fromList(text)

    def preprocessQueries(self, queries):
        """
        Preprocess the queries - segment, tokenize, stem/lemmatize and remove stopwords
        """
        # Segment queries
        segmentedQueries = []
        for query in queries:
            segmentedQuery = self.segmentSentences(query)
            segmentedQueries.append(segmentedQuery)
        json.dump(segmentedQueries,
                  open(self.args.out_folder + "segmented_queries.txt", 'w'))
        # Tokenize queries
        tokenizedQueries = []
        for query in segmentedQueries:
            tokenizedQuery = self.tokenize(query)
            tokenizedQueries.append(tokenizedQuery)
        json.dump(tokenizedQueries,
                  open(self.args.out_folder + "tokenized_queries.txt", 'w'))
        # Stem/Lemmatize queries
        reducedQueries = []
        for query in tokenizedQueries:
            reducedQuery = self.reduceInflection(query)
            reducedQueries.append(reducedQuery)
        json.dump(reducedQueries,
                  open(self.args.out_folder + "reduced_queries.txt", 'w'))
        # Remove stopwords from queries
        stopwordRemovedQueries = []
        for query in reducedQueries:
            stopwordRemovedQuery = self.removeStopwords(query)
            stopwordRemovedQueries.append(stopwordRemovedQuery)
        json.dump(
            stopwordRemovedQueries,
            open(self.args.out_folder + "stopword_removed_queries.txt", 'w'))

        preprocessedQueries = stopwordRemovedQueries
        return preprocessedQueries

    def preprocessDocs(self, docs):
        """
        Preprocess the documents
        """
        # Segment docs
        segmentedDocs = []
        for doc in docs:
            segmentedDoc = self.segmentSentences(doc)
            segmentedDocs.append(segmentedDoc)
        json.dump(segmentedDocs,
                  open(self.args.out_folder + "segmented_docs.txt", 'w'))
        # Tokenize docs
        tokenizedDocs = []
        for doc in segmentedDocs:
            tokenizedDoc = self.tokenize(doc)
            tokenizedDocs.append(tokenizedDoc)
        json.dump(tokenizedDocs,
                  open(self.args.out_folder + "tokenized_docs.txt", 'w'))
        # Stem/Lemmatize docs
        reducedDocs = []
        for doc in tokenizedDocs:
            reducedDoc = self.reduceInflection(doc)
            reducedDocs.append(reducedDoc)
        json.dump(reducedDocs,
                  open(self.args.out_folder + "reduced_docs.txt", 'w'))
        # Remove stopwords from docs
        stopwordRemovedDocs = []
        for doc in reducedDocs:
            stopwordRemovedDoc = self.removeStopwords(doc)
            stopwordRemovedDocs.append(stopwordRemovedDoc)
        json.dump(
            stopwordRemovedDocs,
            open(self.args.out_folder + "stopword_removed_docs.txt", 'w'))

        preprocessedDocs = stopwordRemovedDocs
        return preprocessedDocs

    def evaluateDataset(self):
        """
        Evaluate document-query relevances for all document-query pairs
        """
        # Read queries
        queries_json = json.load(
            open(self.args.dataset + "cran_queries.json", 'r'))[:]
        queries = [item["query"] for item in queries_json]
        # Process queries
        processedQueries = self.preprocessQueries(queries)

        # Read documents (only the first four while the pipeline is being tested)
        docs_json = json.load(
            open(self.args.dataset + "cran_docs.json", 'r'))[:4]
        docs = [item["body"] for item in docs_json]
        # Process documents
        processedDocs = self.preprocessDocs(docs)

        # Remaining code will be added later

    def handleCustomQuery(self):
        """
        Take a custom query as input and return relevances with all documents
        """
        # Get query
        print("Enter query below")
        query = input()
        # Process query
        processedQuery = self.preprocessQueries([query])[0]

        # Read documents (only the first ten while the pipeline is being tested)
        docs_json = json.load(
            open(self.args.dataset + "cran_docs.json", 'r'))[:10]
        docs = [item["body"] for item in docs_json]
        # Process documents
        processedDocs = self.preprocessDocs(docs)
import nltk
import spacy
from nltk.stem import SnowballStemmer
from nltk.stem import WordNetLemmatizer
from sentenceSegmentation import SentenceSegmentation

nltk.download('wordnet')

# run 'python -m spacy download en_core_web_sm' if spaCy has just been
# installed for the first time
nlp = spacy.load('en_core_web_sm')

document = u"doctor operates on patient.Kidney operation is important. operational procedure. leaves and leaf 99 fg"

sb = SnowballStemmer('english')
wn = WordNetLemmatizer()
ss = SentenceSegmentation()

# Compare spaCy lemmatization, Snowball stemming and WordNet lemmatization
# on each sentence of the sample document
for sent in ss.punkt(document):
    doc = nlp(sent)
    print('spacy -', [token.lemma_ for token in doc])
    print('stemming -', [sb.stem(token.text) for token in doc])
    print('wordnet -', [wn.lemmatize(token.text) for token in doc])
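# The InflectionReduction class called by main.py (inflectionReducer.reduce) is
# not shown in this section. Based on the comparison above, a minimal sketch of
# a reduce method using the Snowball stemmer could look like the following;
# this is an assumption, and the actual class may use a lemmatizer instead.
from nltk.stem import SnowballStemmer


class InflectionReductionSketch():

    def reduce(self, text):
        # text: a list of sentences, each a list of tokens;
        # returns the same structure with every token stemmed
        stemmer = SnowballStemmer('english')
        return [[stemmer.stem(token) for token in sentence]
                for sentence in text]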
            A list of lists where each sub-list is a sequence of tokens
        """
        penn = TreebankWordTokenizer()
        tokenizedText = []
        for sentence in text:
            tokenizedText.append(penn.tokenize(sentence))
        return tokenizedText


if __name__ == "__main__":
    queries_json = json.load(
        open(r'D:\PycharmProjects\nlp\cranfield\cran_queries.json', 'r'))[:]
    segmenter = SentenceSegmentation()
    segmented_queries = [
        segmenter.naive(item["query"]) for item in queries_json
    ]
    count = 0
    tokenizer = Tokenization()
    # Count the queries for which naive and Penn Treebank tokenization disagree
    for query in segmented_queries:
        naive_res = tokenizer.naive(query)
        ptb_res = tokenizer.pennTreeBank(query)
        if naive_res != ptb_res:
            count += 1
            # print(naive_res)
            # print(ptb_res)
            # print(query)
    print('ratio of not matched for segmented_queries: ' + str(count) + '/' +
          str(len(segmented_queries)))
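# The naive tokenizer compared against pennTreeBank above is not shown in this
# fragment. A minimal, hypothetical version that simply splits each sentence on
# whitespace could look like this (illustrative only; the real naive method may
# also strip punctuation or use a different rule):


class TokenizationNaiveSketch():

    def naive(self, text):
        # text: a list of sentence strings; returns a list of token lists
        return [sentence.split() for sentence in text]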
# Local project modules; module names are assumed to mirror the class names,
# following the sentenceSegmentation module used elsewhere in this repo.
from sentenceSegmentation import SentenceSegmentation
from tokenization import Tokenization
from inflectionReduction import InflectionReduction
from stopwordRemoval import StopwordRemoval
from informationRetrieval import InformationRetrieval
from evaluation import Evaluation

import json
import matplotlib.pyplot as plt


class SearchEngine:

    def __init__(self, args):
        self.args = args
        self.tokenizer = Tokenization()
        self.sentenceSegmenter = SentenceSegmentation()
        self.inflectionReducer = InflectionReduction()
        self.stopwordRemover = StopwordRemoval()
        self.informationRetriever = InformationRetrieval()
        self.evaluator = Evaluation()

    def segmentSentences(self, text):
        """
        Call the required sentence segmenter
        """
        if self.args.segmenter == "naive":
            return self.sentenceSegmenter.naive(text)
        elif self.args.segmenter == "punkt":
            return self.sentenceSegmenter.punkt(text)

    def tokenize(self, text):
        """
        Call the required tokenizer
        """
        if self.args.tokenizer == "naive":
            return self.tokenizer.naive(text)
        elif self.args.tokenizer == "ptb":
            return self.tokenizer.pennTreeBank(text)

    def reduceInflection(self, text):
        """
        Call the required stemmer/lemmatizer
        """
        return self.inflectionReducer.reduce(text)

    def removeStopwords(self, text):
        """
        Call the required stopword remover
        """
        return self.stopwordRemover.fromList(text)

    def preprocessQueries(self, queries):
        """
        Preprocess the queries - segment, tokenize, stem/lemmatize and remove stopwords
        """
        # Segment queries
        segmentedQueries = []
        for query in queries:
            segmentedQuery = self.segmentSentences(query)
            segmentedQueries.append(segmentedQuery)
        json.dump(segmentedQueries,
                  open(self.args.out_folder + "segmented_queries.txt", 'w'))
        # Tokenize queries
        tokenizedQueries = []
        for query in segmentedQueries:
            tokenizedQuery = self.tokenize(query)
            tokenizedQueries.append(tokenizedQuery)
        json.dump(tokenizedQueries,
                  open(self.args.out_folder + "tokenized_queries.txt", 'w'))
        # Stem/Lemmatize queries
        reducedQueries = []
        for query in tokenizedQueries:
            reducedQuery = self.reduceInflection(query)
            reducedQueries.append(reducedQuery)
        json.dump(reducedQueries,
                  open(self.args.out_folder + "reduced_queries.txt", 'w'))
        # Remove stopwords from queries
        stopwordRemovedQueries = []
        for query in reducedQueries:
            stopwordRemovedQuery = self.removeStopwords(query)
            stopwordRemovedQueries.append(stopwordRemovedQuery)
        json.dump(
            stopwordRemovedQueries,
            open(self.args.out_folder + "stopword_removed_queries.txt", 'w'))

        preprocessedQueries = stopwordRemovedQueries
        return preprocessedQueries

    def preprocessDocs(self, docs):
        """
        Preprocess the documents
        """
        # Segment docs
        segmentedDocs = []
        for doc in docs:
            segmentedDoc = self.segmentSentences(doc)
            segmentedDocs.append(segmentedDoc)
        json.dump(segmentedDocs,
                  open(self.args.out_folder + "segmented_docs.txt", 'w'))
        # Tokenize docs
        tokenizedDocs = []
        for doc in segmentedDocs:
            tokenizedDoc = self.tokenize(doc)
            tokenizedDocs.append(tokenizedDoc)
        json.dump(tokenizedDocs,
                  open(self.args.out_folder + "tokenized_docs.txt", 'w'))
        # Stem/Lemmatize docs
        reducedDocs = []
        for doc in tokenizedDocs:
            reducedDoc = self.reduceInflection(doc)
            reducedDocs.append(reducedDoc)
        json.dump(reducedDocs,
                  open(self.args.out_folder + "reduced_docs.txt", 'w'))
        # Remove stopwords from docs
        stopwordRemovedDocs = []
        for doc in reducedDocs:
            stopwordRemovedDoc = self.removeStopwords(doc)
            stopwordRemovedDocs.append(stopwordRemovedDoc)
        json.dump(
            stopwordRemovedDocs,
            open(self.args.out_folder + "stopword_removed_docs.txt", 'w'))

        preprocessedDocs = stopwordRemovedDocs
        return preprocessedDocs

    def evaluateDataset(self):
        """
        - preprocesses the queries and documents, stores in output folder
        - invokes the IR system
        - evaluates precision, recall, fscore, nDCG and MAP
          for all queries in the Cranfield dataset
        - produces graphs of the evaluation metrics in the output folder
        """
        # Read queries
        queries_json = json.load(
            open(self.args.dataset + "cran_queries.json", 'r'))[:]
        query_ids, queries = [item["query number"] for item in queries_json], \
            [item["query"] for item in queries_json]
        # Process queries
        processedQueries = self.preprocessQueries(queries)

        # Read documents
        docs_json = json.load(
            open(self.args.dataset + "cran_docs.json", 'r'))[:]
        doc_ids, docs = [item["id"] for item in docs_json], \
            [item["body"] for item in docs_json]
        # Process documents
        processedDocs = self.preprocessDocs(docs)

        # Build document index
        self.informationRetriever.buildIndex(processedDocs, doc_ids)
        # Rank the documents for each query
        doc_IDs_ordered = self.informationRetriever.rank(processedQueries)

        # Read relevance judgements
        qrels = json.load(
            open(self.args.dataset + "cran_qrels.json", 'r'))[:]

        # Calculate precision, recall, f-score, MAP and nDCG for k = 1 to 10
        precisions, recalls, fscores, MAPs, nDCGs = [], [], [], [], []
        for k in range(1, 11):
            precision = self.evaluator.meanPrecision(
                doc_IDs_ordered, query_ids, qrels, k)
            precisions.append(precision)
            recall = self.evaluator.meanRecall(
                doc_IDs_ordered, query_ids, qrels, k)
            recalls.append(recall)
            fscore = self.evaluator.meanFscore(
                doc_IDs_ordered, query_ids, qrels, k)
            fscores.append(fscore)
            print("Precision, Recall and F-score @ " + str(k) + " : " +
                  str(precision) + ", " + str(recall) + ", " + str(fscore))
            MAP = self.evaluator.meanAveragePrecision(
                doc_IDs_ordered, query_ids, qrels, k)
            MAPs.append(MAP)
            nDCG = self.evaluator.meanNDCG(
                doc_IDs_ordered, query_ids, qrels, k)
            nDCGs.append(nDCG)
            print("MAP, nDCG @ " + str(k) + " : " +
                  str(MAP) + ", " + str(nDCG))

        # Plot the metrics and save the plot
        plt.plot(range(1, 11), precisions, label="Precision")
        plt.plot(range(1, 11), recalls, label="Recall")
        plt.plot(range(1, 11), fscores, label="F-Score")
        plt.plot(range(1, 11), MAPs, label="MAP")
        plt.plot(range(1, 11), nDCGs, label="nDCG")
        plt.legend()
        plt.title("Evaluation Metrics - Cranfield Dataset")
        plt.xlabel("k")
        plt.savefig(self.args.out_folder + "eval_plot.png")

    def handleCustomQuery(self):
        """
        Take a custom query as input and return top five relevant documents
        """
        # Get query
        print("Enter query below")
        query = input()
        # Process query
        processedQuery = self.preprocessQueries([query])[0]

        # Read documents
        docs_json = json.load(
            open(self.args.dataset + "cran_docs.json", 'r'))[:]
        doc_ids, docs = [item["id"] for item in docs_json], \
            [item["body"] for item in docs_json]
        # Process documents
        processedDocs = self.preprocessDocs(docs)

        # Build document index
        self.informationRetriever.buildIndex(processedDocs, doc_ids)
        # Rank the documents for the query
        doc_IDs_ordered = self.informationRetriever.rank([processedQuery])[0]

        # Print the IDs of the first five documents
        print("\nTop five document IDs : ")
        for id_ in doc_IDs_ordered[:5]:
            print(id_)
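# The command-line entry point that populates args is not shown in this
# section. A minimal sketch, assuming flags for the dataset path, output
# folder, segmenter, tokenizer and a custom-query mode (the flag names and
# defaults are assumptions), could look like this:
if __name__ == "__main__":
    import argparse

    parser = argparse.ArgumentParser(description='main.py')
    parser.add_argument('-dataset', default="cranfield/",
                        help="Path to the dataset folder")
    parser.add_argument('-out_folder', default="output/",
                        help="Path to the output folder")
    parser.add_argument('-segmenter', default="punkt",
                        help="Sentence segmenter type [naive|punkt]")
    parser.add_argument('-tokenizer', default="ptb",
                        help="Tokenizer type [naive|ptb]")
    parser.add_argument('-custom', action="store_true",
                        help="Take a custom query as input instead of "
                             "evaluating the whole Cranfield dataset")
    args = parser.parse_args()

    searchEngine = SearchEngine(args)
    if args.custom:
        searchEngine.handleCustomQuery()
    else:
        searchEngine.evaluateDataset()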