class SearchEngine:
    def __init__(self, args):
        self.args = args
        self.tokenizer = Tokenization()
        self.sentenceSegmenter = SentenceSegmentation()
        self.inflectionReducer = InflectionReduction()
        self.stopwordRemover = StopwordRemoval()

    def segmentSentences(self, text):
        """
        Call the required sentence segmenter
		"""
        if self.args.segmenter == "naive":
            return self.sentenceSegmenter.naive(text)
        elif self.args.segmenter == "punkt":
            return self.sentenceSegmenter.punkt(text)

    def tokenize(self, text):
        """
        Call the required tokenizer
		"""
        if self.args.tokenizer == "naive":
            return self.tokenizer.naive(text)
        elif self.args.tokenizer == "ptb":
            return self.tokenizer.pennTreeBank(text)

    def reduceInflection(self, text):
        """
        Call the required stemmer/lemmatizer
		"""
        return self.inflectionReducer.reduce(text)

    def removeStopwords(self, text):
        """
        Call the required stopword remover
		"""
        return self.stopwordRemover.fromList(text)

    def preprocessQueries(self, queries):
        """
		Preprocess the queries - segment, tokenize, stem/lemmatize and remove stopwords
		"""

        # Segment queries
        segmentedQueries = []
        for query in queries:
            segmentedQuery = self.segmentSentences(query)
            segmentedQueries.append(segmentedQuery)
        json.dump(segmentedQueries,
                  open(self.args.out_folder + "segmented_queries.txt", 'w'))
        # Tokenize queries
        tokenizedQueries = []
        for query in segmentedQueries:
            tokenizedQuery = self.tokenize(query)
            tokenizedQueries.append(tokenizedQuery)
        json.dump(tokenizedQueries,
                  open(self.args.out_folder + "tokenized_queries.txt", 'w'))
        # Stem/Lemmatize queries
        reducedQueries = []
        for query in tokenizedQueries:
            reducedQuery = self.reduceInflection(query)
            reducedQueries.append(reducedQuery)
        json.dump(reducedQueries,
                  open(self.args.out_folder + "reduced_queries.txt", 'w'))
        # Remove stopwords from queries
        stopwordRemovedQueries = []
        for query in reducedQueries:
            stopwordRemovedQuery = self.removeStopwords(query)
            stopwordRemovedQueries.append(stopwordRemovedQuery)
        json.dump(
            stopwordRemovedQueries,
            open(self.args.out_folder + "stopword_removed_queries.txt", 'w'))

        preprocessedQueries = stopwordRemovedQueries
        return preprocessedQueries

    def preprocessDocs(self, docs):
        """
		Preprocess the documents
		"""

        # Segment docs
        segmentedDocs = []
        for doc in docs:
            segmentedDoc = self.segmentSentences(doc)
            segmentedDocs.append(segmentedDoc)
        json.dump(segmentedDocs,
                  open(self.args.out_folder + "segmented_docs.txt", 'w'))
        # Tokenize docs
        tokenizedDocs = []
        for doc in segmentedDocs:
            tokenizedDoc = self.tokenize(doc)
            tokenizedDocs.append(tokenizedDoc)
        json.dump(tokenizedDocs,
                  open(self.args.out_folder + "tokenized_docs.txt", 'w'))
        # Stem/Lemmatize docs
        reducedDocs = []
        for doc in tokenizedDocs:
            reducedDoc = self.reduceInflection(doc)
            reducedDocs.append(reducedDoc)
        json.dump(reducedDocs,
                  open(self.args.out_folder + "reduced_docs.txt", 'w'))
        # Remove stopwords from docs
        stopwordRemovedDocs = []
        for doc in reducedDocs:
            stopwordRemovedDoc = self.removeStopwords(doc)
            stopwordRemovedDocs.append(stopwordRemovedDoc)
        json.dump(
            stopwordRemovedDocs,
            open(self.args.out_folder + "stopword_removed_docs.txt", 'w'))

        preprocessedDocs = stopwordRemovedDocs
        return preprocessedDocs

    def evaluateDataset(self):
        """
		Evaluate document-query relevances for all document-query pairs
		"""

        # Read queries
        queries_json = json.load(open(self.args.dataset + "cran_queries.json",
                                      'r'))[:]
        queries = [item["query"] for item in queries_json]
        # Process queries
        processedQueries = self.preprocessQueries(queries)

        # Read documents
        docs_json = json.load(open(self.args.dataset + "cran_docs.json",
                                   'r'))[:4]
        docs = [item["body"] for item in docs_json]
        # Process documents
        processedDocs = self.preprocessDocs(docs)

        # Remaining code will be added later

    def handleCustomQuery(self):
        """
		Take a custom query as input and return relevances with all documents
		"""

        #Get query
        print("Enter query below")
        query = input()
        # Process query
        processedQuery = self.preprocessQueries([query])[0]

        # Read documents
        docs_json = json.load(open(self.args.dataset + "cran_docs.json", 'r'))[:10]
        docs = [item["body"] for item in docs_json]
        # Process documents
        processedDocs = self.preprocessDocs(docs)
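
The SearchEngine above (and Example #3 below) delegates sentence splitting to a SentenceSegmentation class whose implementation is not part of this excerpt. Below is a minimal sketch of such a module: the class and method names (naive, punkt) match the calls above, but the bodies are illustrative assumptions, using a simple regex heuristic for naive and NLTK's pre-trained Punkt model for punkt.

import re

from nltk.tokenize import sent_tokenize  # requires nltk.download('punkt')


class SentenceSegmentation:
    def naive(self, text):
        """Split on '.', '?' or '!' followed by whitespace (simple heuristic)."""
        sentences = re.split(r'(?<=[.?!])\s+', text.strip())
        return [s for s in sentences if s]

    def punkt(self, text):
        """Split using NLTK's pre-trained Punkt sentence tokenizer."""
        return sent_tokenize(text)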
Example #3
import spacy
from nltk.stem import SnowballStemmer
import nltk
from sentenceSegmentation import SentenceSegmentation

nltk.download('wordnet')
from nltk.stem import WordNetLemmatizer

# run 'python -m spacy download en_core_web_sm' if the model is not installed yet
nlp = spacy.load('en_core_web_sm')
document = u"doctor operates on patient.Kidney operation is important. operational procedure. leaves and leaf 99 fg"

sb = SnowballStemmer('english')
wn = WordNetLemmatizer()
ss = SentenceSegmentation()
for sent in ss.punkt(document):
    doc = nlp(sent)
    print('spacy -', [token.lemma_ for token in doc])
    print('stemming -', [sb.stem(token.text) for token in doc])
    print('wordnet -', [wn.lemmatize(token.text) for token in doc])
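
Example #3 contrasts three ways of normalising word forms (spaCy lemmas, Snowball stemming, WordNet lemmatization); the SearchEngine class calls self.inflectionReducer.reduce(text) on the tokenized text, but the InflectionReduction class itself is not shown in this excerpt. Below is a minimal sketch of such a class, assuming Porter stemming over a list of token lists; the class and method names match the calls above, while the choice of stemmer is an assumption and any of the three approaches compared here could be substituted.

from nltk.stem import PorterStemmer


class InflectionReduction:
    def reduce(self, text):
        """Stem every token in a list of token lists (one list per sentence)."""
        stemmer = PorterStemmer()
        return [[stemmer.stem(token) for token in sentence]
                for sentence in text]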
Example #4
import json

from nltk.tokenize import TreebankWordTokenizer

from sentenceSegmentation import SentenceSegmentation


class Tokenization:
    # The naive tokenizer used in the __main__ block below belongs to the
    # original tokenization.py but is not shown in this excerpt.

    def pennTreeBank(self, text):
        """
        Returns a list of lists where each sub-list is a sequence of tokens
        """
        penn = TreebankWordTokenizer()
        tokenizedText = []
        for sentence in text:
            tokenizedText.append(penn.tokenize(sentence))
        return tokenizedText


if __name__ == "__main__":
    queries_json = json.load(
        open(r'D:\PycharmProjects\nlp\cranfield\cran_queries.json', 'r'))[:]
    segmenter = SentenceSegmentation()
    segmented_queries = [
        segmenter.naive(item["query"]) for item in queries_json
    ]
    count = 0
    tokenizer = Tokenization()
    for query in segmented_queries:
        naive_res = tokenizer.naive(query)
        ptb_res = tokenizer.pennTreeBank(query)
        if naive_res != ptb_res:
            count += 1
        #   print(naive_res)
        #   print(ptb_res)
        #   print(query)
    print('queries where naive and Penn Treebank tokenization differ: ' +
          str(count) + '/' + str(len(segmented_queries)))
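
The comparison above relies on Tokenization.naive, which is omitted from the excerpt. The function below is a hypothetical stand-in (the name naive_tokenize and the splitting rule are assumptions) showing one way such a naive tokenizer could work: split each sentence on whitespace and common punctuation.

import re


def naive_tokenize(text):
    """Return a list of token lists, one per input sentence."""
    tokenizedText = []
    for sentence in text:
        tokens = [tok for tok in re.split(r'[\s,.;?!]+', sentence) if tok]
        tokenizedText.append(tokens)
    return tokenizedText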
# The project-module imports below are assumed from the layout implied by the
# examples above (module names mirror the class names, e.g.
# sentenceSegmentation.py); adjust them to match the actual file names.
import json

import matplotlib.pyplot as plt

from sentenceSegmentation import SentenceSegmentation
from tokenization import Tokenization
from inflectionReduction import InflectionReduction
from stopwordRemoval import StopwordRemoval
from informationRetrieval import InformationRetrieval
from evaluation import Evaluation


class SearchEngine:
    def __init__(self, args):
        self.args = args

        self.tokenizer = Tokenization()
        self.sentenceSegmenter = SentenceSegmentation()
        self.inflectionReducer = InflectionReduction()
        self.stopwordRemover = StopwordRemoval()

        self.informationRetriever = InformationRetrieval()
        self.evaluator = Evaluation()

    def segmentSentences(self, text):
        """
		Call the required sentence segmenter
		"""
        if self.args.segmenter == "naive":
            return self.sentenceSegmenter.naive(text)
        elif self.args.segmenter == "punkt":
            return self.sentenceSegmenter.punkt(text)

    def tokenize(self, text):
        """
		Call the required tokenizer
		"""
        if self.args.tokenizer == "naive":
            return self.tokenizer.naive(text)
        elif self.args.tokenizer == "ptb":
            return self.tokenizer.pennTreeBank(text)

    def reduceInflection(self, text):
        """
		Call the required stemmer/lemmatizer
		"""
        return self.inflectionReducer.reduce(text)

    def removeStopwords(self, text):
        """
		Call the required stopword remover
		"""
        return self.stopwordRemover.fromList(text)

    def preprocessQueries(self, queries):
        """
		Preprocess the queries - segment, tokenize, stem/lemmatize and remove stopwords
		"""

        # Segment queries
        segmentedQueries = []
        for query in queries:
            segmentedQuery = self.segmentSentences(query)
            segmentedQueries.append(segmentedQuery)
        json.dump(segmentedQueries,
                  open(self.args.out_folder + "segmented_queries.txt", 'w'))
        # Tokenize queries
        tokenizedQueries = []
        for query in segmentedQueries:
            tokenizedQuery = self.tokenize(query)
            tokenizedQueries.append(tokenizedQuery)
        json.dump(tokenizedQueries,
                  open(self.args.out_folder + "tokenized_queries.txt", 'w'))
        # Stem/Lemmatize queries
        reducedQueries = []
        for query in tokenizedQueries:
            reducedQuery = self.reduceInflection(query)
            reducedQueries.append(reducedQuery)
        json.dump(reducedQueries,
                  open(self.args.out_folder + "reduced_queries.txt", 'w'))
        # Remove stopwords from queries
        stopwordRemovedQueries = []
        for query in reducedQueries:
            stopwordRemovedQuery = self.removeStopwords(query)
            stopwordRemovedQueries.append(stopwordRemovedQuery)
        json.dump(
            stopwordRemovedQueries,
            open(self.args.out_folder + "stopword_removed_queries.txt", 'w'))

        preprocessedQueries = stopwordRemovedQueries
        return preprocessedQueries

    def preprocessDocs(self, docs):
        """
		Preprocess the documents
		"""

        # Segment docs
        segmentedDocs = []
        for doc in docs:
            segmentedDoc = self.segmentSentences(doc)
            segmentedDocs.append(segmentedDoc)
        json.dump(segmentedDocs,
                  open(self.args.out_folder + "segmented_docs.txt", 'w'))
        # Tokenize docs

        tokenizedDocs = []
        for doc in segmentedDocs:
            tokenizedDoc = self.tokenize(doc)
            tokenizedDocs.append(tokenizedDoc)
        json.dump(tokenizedDocs,
                  open(self.args.out_folder + "tokenized_docs.txt", 'w'))
        # Stem/Lemmatize docs

        reducedDocs = []
        for doc in tokenizedDocs:
            reducedDoc = self.reduceInflection(doc)
            reducedDocs.append(reducedDoc)
        json.dump(reducedDocs,
                  open(self.args.out_folder + "reduced_docs.txt", 'w'))
        # Remove stopwords from docs

        stopwordRemovedDocs = []
        for doc in reducedDocs:
            stopwordRemovedDoc = self.removeStopwords(doc)
            stopwordRemovedDocs.append(stopwordRemovedDoc)
        json.dump(
            stopwordRemovedDocs,
            open(self.args.out_folder + "stopword_removed_docs.txt", 'w'))

        preprocessedDocs = stopwordRemovedDocs
        return preprocessedDocs

    def evaluateDataset(self):
        """
		- preprocesses the queries and documents, stores in output folder
		- invokes the IR system
		- evaluates precision, recall, fscore, nDCG and MAP 
		  for all queries in the Cranfield dataset
		- produces graphs of the evaluation metrics in the output folder
		"""

        # Read queries
        queries_json = json.load(open(self.args.dataset + "cran_queries.json",
                                      'r'))[:]
        query_ids, queries = [item["query number"] for item in queries_json], \
              [item["query"] for item in queries_json]
        # Process queries
        processedQueries = self.preprocessQueries(queries)

        # Read documents
        docs_json = json.load(open(self.args.dataset + "cran_docs.json", 'r'))[:]
        doc_ids, docs = [item["id"] for item in docs_json], \
              [item["body"] for item in docs_json]
        # Process documents
        processedDocs = self.preprocessDocs(docs)

        # Build document index
        self.informationRetriever.buildIndex(processedDocs, doc_ids)
        # Rank the documents for each query
        doc_IDs_ordered = self.informationRetriever.rank(processedQueries)

        # Read relevance judgements
        qrels = json.load(open(self.args.dataset + "cran_qrels.json", 'r'))[:]

        # Calculate precision, recall, f-score, MAP and nDCG for k = 1 to 10
        precisions, recalls, fscores, MAPs, nDCGs = [], [], [], [], []
        for k in range(1, 11):
            precision = self.evaluator.meanPrecision(doc_IDs_ordered,
                                                     query_ids, qrels, k)
            precisions.append(precision)
            recall = self.evaluator.meanRecall(doc_IDs_ordered, query_ids,
                                               qrels, k)
            recalls.append(recall)
            fscore = self.evaluator.meanFscore(doc_IDs_ordered, query_ids,
                                               qrels, k)
            fscores.append(fscore)
            print("Precision, Recall and F-score @ " + str(k) + " : " +
                  str(precision) + ", " + str(recall) + ", " + str(fscore))
            MAP = self.evaluator.meanAveragePrecision(doc_IDs_ordered,
                                                      query_ids, qrels, k)
            MAPs.append(MAP)
            nDCG = self.evaluator.meanNDCG(doc_IDs_ordered, query_ids, qrels,
                                           k)
            nDCGs.append(nDCG)
            print("MAP, nDCG @ " + str(k) + " : " + str(MAP) + ", " +
                  str(nDCG))

        # Plot the metrics and save the plot
        plt.plot(range(1, 11), precisions, label="Precision")
        plt.plot(range(1, 11), recalls, label="Recall")
        plt.plot(range(1, 11), fscores, label="F-Score")
        plt.plot(range(1, 11), MAPs, label="MAP")
        plt.plot(range(1, 11), nDCGs, label="nDCG")
        plt.legend()
        plt.title("Evaluation Metrics - Cranfield Dataset")
        plt.xlabel("k")
        plt.savefig(self.args.out_folder + "eval_plot.png")

    def handleCustomQuery(self):
        """
		Take a custom query as input and return top five relevant documents
		"""

        #Get query
        print("Enter query below")
        query = input()
        # Process query
        processedQuery = self.preprocessQueries([query])[0]

        # Read documents
        docs_json = json.load(open(self.args.dataset + "cran_docs.json", 'r'))[:]
        doc_ids, docs = [item["id"] for item in docs_json], \
             [item["body"] for item in docs_json]
        # Process documents
        processedDocs = self.preprocessDocs(docs)

        # Build document index
        self.informationRetriever.buildIndex(processedDocs, doc_ids)
        # Rank the documents for the query
        doc_IDs_ordered = self.informationRetriever.rank([processedQuery])[0]

        # Print the IDs of the first five documents
        print("\nTop five document IDs : ")
        for id_ in doc_IDs_ordered[:5]:
            print(id_)
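
The class above reads self.args.dataset, self.args.out_folder, self.args.segmenter and self.args.tokenizer, so any driver has to supply an object with those attributes. Below is a minimal entry-point sketch built with argparse; the flag names mirror those attributes, while the defaults and the -custom switch are assumptions added for illustration.

import argparse

if __name__ == "__main__":
    parser = argparse.ArgumentParser(description='Cranfield search engine')
    # Flag names mirror the attributes used above; defaults are assumptions.
    parser.add_argument('-dataset', default="cranfield/",
                        help="Path to the dataset folder")
    parser.add_argument('-out_folder', default="output/",
                        help="Path to the output folder")
    parser.add_argument('-segmenter', default="punkt",
                        help="Sentence segmenter type [naive|punkt]")
    parser.add_argument('-tokenizer', default="ptb",
                        help="Tokenizer type [naive|ptb]")
    parser.add_argument('-custom', action="store_true",
                        help="Take a custom query instead of running the full evaluation")
    args = parser.parse_args()

    searchEngine = SearchEngine(args)
    if args.custom:
        searchEngine.handleCustomQuery()
    else:
        searchEngine.evaluateDataset()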