# Shared imports for the snippets in this section.
import numpy as np
import matplotlib.pyplot as plt
from sklearn.feature_extraction.text import CountVectorizer

import collectionloaders
import parser  # project-local tokenizing/stemming helpers
import RetrievalModelsMatrix


def compute_score(self, models, model_type, query):
    # Dispatch the stemmed query to the selected retrieval model:
    # 0 = VSM, 1 = Dirichlet-smoothed language model,
    # 2 = Jelinek-Mercer language model, anything else = RM3.
    stemmed_query = parser.stemSentence(query)
    if model_type == 0:
        return models.score_vsm(stemmed_query)
    elif model_type == 1:
        return models.score_lmd(stemmed_query)
    elif model_type == 2:
        return models.score_lmjm(stemmed_query)
    else:
        return models.scoreRM3(stemmed_query)
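# A minimal usage sketch (names here are illustrative, not part of the project):
#
#     scores = searcher.compute_score(models, 2, 'boundary layer transition')
#
# The same dispatch can also be written as a dict lookup, assuming the model
# methods shown above:
#
#     dispatch = {0: models.score_vsm, 1: models.score_lmd, 2: models.score_lmjm}
#     scores = dispatch.get(model_type, models.scoreRM3)(parser.stemSentence(query))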
def __init__(self, bigrams):
    ### 1. Load the corpus
    cranfield = collectionloaders.CranfieldTestBed()

    ### 2. Parse the corpus
    # Tokenize, stem and remove stop words
    if not bigrams:
        vectorizer = CountVectorizer()
    else:
        vectorizer = CountVectorizer(ngram_range=(1, 2),
                                     token_pattern=r'\b\w+\b', min_df=1)
    corpus = parser.stemCorpus(cranfield.corpus_cranfield['abstract'])

    ### 3. Create the model
    # Compute the term-frequency matrix and the model statistics
    tf_cranfield = vectorizer.fit_transform(corpus).toarray()

    self.map_val = []
    lambs = np.arange(0, 1, 0.1)
    # Compute the MAP for each value in lambs, then plot how the MAP
    # varies over the whole lambda range.
    for lamb in lambs:
        models = RetrievalModelsMatrix.RetrievalModelsMatrix(
            tf_cranfield, vectorizer, lamb)
        i = 1
        map_model = 0
        for query in cranfield.queries:
            # Parse the query and compute the document scores
            scores = models.score_lmjm(parser.stemSentence(query))

            # Do the evaluation
            [average_precision, precision, self.recall,
             thresholds] = cranfield.eval(scores, i)
            map_model = map_model + average_precision
            i = i + 1
        self.map_val.append(map_model / cranfield.num_queries)

    plt.plot(lambs, self.map_val, color='b', alpha=1)
    plt.ylim([0.0, 0.5])
    plt.xlim([0.0, 1.0])
    plt.xlabel('Lambda')
    plt.ylabel('MAP')
    plt.title('MAP-Lambda')
    plt.savefig('results/map-lamb.png', dpi=100)
    plt.show()
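# For reference, a minimal sketch of Jelinek-Mercer scoring, assuming
# score_lmjm mixes the document and collection language models with weight
# lamb (the names and the tf layout, documents x terms, are assumptions):
def lmjm_score_sketch(tf, query_term_ids, lamb):
    doc_len = tf.sum(axis=1, keepdims=True)                 # |d|
    coll_prob = tf.sum(axis=0) / tf.sum()                   # P(t|C)
    p_td = tf / np.maximum(doc_len, 1)                      # ML estimate P(t|d)
    mixture = lamb * p_td + (1 - lamb) * coll_prob          # smoothed probability
    # Log query likelihood per document, summed over the query terms
    return np.log(np.maximum(mixture[:, query_term_ids], 1e-12)).sum(axis=1)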
def vsm(vectorizer, cl, verbose):
    plt.clf()
    corpus = parser.stemCorpus(cl.corpus_cranfield['abstract'])
    tf_cranfield = vectorizer.fit_transform(corpus).toarray()
    models = RetrievalModelsMatrix.RetrievalModelsMatrix(
        tf_cranfield, vectorizer)

    i = 1
    map_vsm = 0
    p10aux = 0
    precision_vsm = []
    for query in cl.queries_cranfield['query']:
        scores = models.score_vsm(parser.stemSentence(query))
        [average_precision, precision_11point,
         recall_11point, p10] = cl.eval(scores, i)

        map_vsm = map_vsm + average_precision
        p10aux = p10aux + p10
        # Keep the per-query 11-point precision curves so the mean/std
        # band below varies with recall
        precision_vsm.append(precision_11point)

        if verbose:
            plt.plot(recall_11point, precision_11point,
                     color='silver', alpha=0.1)
            print('qid =', i, 'VSM AP=', average_precision)
        i = i + 1

    map_vsm = map_vsm / cl.num_queries
    p10aux = p10aux / cl.num_queries

    # Mean 11-point precision-recall curve across queries, with a one-std band
    plt.plot(recall_11point, np.mean(precision_vsm, axis=0), color='b', alpha=1)
    plt.gca().set_aspect('equal', adjustable='box')
    plt.fill_between(
        recall_11point,
        np.mean(precision_vsm, axis=0) - np.std(precision_vsm, axis=0),
        np.mean(precision_vsm, axis=0) + np.std(precision_vsm, axis=0),
        facecolor='b', alpha=0.1)
    plt.xlabel('Recall')
    plt.ylabel('Precision')
    plt.ylim([0.0, 1.0])
    plt.xlim([0.0, 1.0])
    plt.title('Precision-Recall (MAP={0:0.2f})'.format(map_vsm))
    plt.savefig('results/VSMResult.png', dpi=100)

    return [map_vsm, p10aux]
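# For reference, a minimal sketch of the vector-space score, assuming score_vsm
# is tf-idf cosine similarity (names and the tf layout are assumptions):
def vsm_score_sketch(tf, query_term_ids):
    idf = np.log(tf.shape[0] / np.maximum((tf > 0).sum(axis=0), 1))
    tfidf = tf * idf                                        # document vectors
    q = np.zeros(tf.shape[1])
    q[query_term_ids] = idf[query_term_ids]                 # query vector
    norms = np.linalg.norm(tfidf, axis=1) * np.linalg.norm(q)
    return tfidf @ q / np.maximum(norms, 1e-12)             # cosine per document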
def lmjm(vectorizer, cl, verbose, lmbd):
    plt.clf()
    corpus = parser.stemCorpus(cl.corpus_cranfield['abstract'])
    tf_cranfield = vectorizer.fit_transform(corpus).toarray()
    models = RetrievalModelsMatrix.RetrievalModelsMatrix(
        tf_cranfield, vectorizer)

    precision_lmjm = []
    p10aux = 0
    map_lmjm = 0
    j = 1
    for query in cl.queries_cranfield['query']:
        scores = models.score_lmjm(parser.stemSentence(query), lmbd)
        [average_precision, precision_11point,
         recall_11point, p10] = cl.eval(scores, j)

        map_lmjm = map_lmjm + average_precision
        p10aux = p10aux + p10
        # Keep the per-query 11-point precision curves for the mean/std band
        precision_lmjm.append(precision_11point)

        if verbose:
            plt.plot(recall_11point, precision_11point,
                     color='silver', alpha=0.1)
            print('qid =', j, 'LMJM AP=', average_precision)
        j = j + 1

    map_lmjm = map_lmjm / cl.num_queries
    p10aux = p10aux / cl.num_queries

    # Mean 11-point precision-recall curve across queries, with a one-std band
    plt.plot(recall_11point, np.mean(precision_lmjm, axis=0), color='b', alpha=1)
    plt.gca().set_aspect('equal', adjustable='box')
    plt.fill_between(
        recall_11point,
        np.mean(precision_lmjm, axis=0) - np.std(precision_lmjm, axis=0),
        np.mean(precision_lmjm, axis=0) + np.std(precision_lmjm, axis=0),
        facecolor='b', alpha=0.1)
    plt.xlabel('Recall')
    plt.ylabel('Precision')
    plt.ylim([0.0, 1.0])
    plt.xlim([0.0, 1.0])
    plt.title('Precision-Recall (MAP={0:0.2f})'.format(map_lmjm))
    plt.savefig('results/LMJMResult.png', dpi=100)

    return [map_lmjm, p10aux]
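# Example call (hypothetical driver code; a fixed lmbd is used here, while the
# sweep over lambda values lives in the constructor above):
#
#     cl = collectionloaders.CranfieldTestBed()
#     map_lmjm, p10 = lmjm(CountVectorizer(), cl, verbose=False, lmbd=0.5)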
def __init__(self, bigrams):
    ### 1. Load the corpus
    cranfield = collectionloaders.CranfieldTestBed()

    ### 2. Parse the corpus
    # Tokenize, stem and remove stop words
    if not bigrams:
        vectorizer = CountVectorizer()
    else:
        vectorizer = CountVectorizer(ngram_range=(1, 2),
                                     token_pattern=r'\b\w+\b', min_df=1)
    corpus = parser.stemCorpus(cranfield.corpus_cranfield['abstract'])

    ### 3. Create the model
    # Compute the term-frequency matrix and the model statistics
    # (0.5 and 250 are the smoothing parameters passed to the model,
    # presumably lambda and the Dirichlet prior mu)
    tf_cranfield = vectorizer.fit_transform(corpus).toarray()
    models = RetrievalModelsMatrix.RetrievalModelsMatrix(
        tf_cranfield, vectorizer, 0.5, 250)

    ### 4. Run the queries over the corpus
    i = 1
    self.p10_model = 0
    self.precision_model = []
    for query in cranfield.queries:
        # Parse the query and compute the document scores
        scores = models.score_lmd(parser.stemSentence(query))

        # Do the evaluation
        [average_precision, precision, self.recall,
         p10] = cranfield.eval(scores, i)

        # Accumulate the P@10 values obtained for the different queries
        self.p10_model = self.p10_model + p10
        self.precision_model.append(precision)
        i = i + 1

    # Compute the mean P@10 and print it
    self.p10_model = self.p10_model / cranfield.num_queries
    print('\nP10 =', self.p10_model)
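# For reference, a minimal sketch of Dirichlet-smoothed query-likelihood
# scoring, assuming score_lmd implements the standard form with prior mu
# (the names and tf layout, documents x terms, are assumptions):
def lmd_score_sketch(tf, query_term_ids, mu=250):
    doc_len = tf.sum(axis=1, keepdims=True)                 # |d|
    coll_prob = tf.sum(axis=0) / tf.sum()                   # P(t|C)
    p_td = (tf + mu * coll_prob) / (doc_len + mu)           # smoothed P(t|d)
    return np.log(np.maximum(p_td[:, query_term_ids], 1e-12)).sum(axis=1)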
def bm25(vectorizer, cl, verbose, k1, b):
    plt.clf()
    corpus = parser.stemCorpus(cl.corpus_cranfield['abstract'])
    tf_cranfield = vectorizer.fit_transform(corpus).toarray()
    models = RetrievalModelsMatrix.RetrievalModelsMatrix(
        tf_cranfield, vectorizer)

    i = 1
    map_bm25 = 0
    precision_bm25 = []
    for query in cl.queries_cranfield['query']:
        scores = models.score_bm25(parser.stemSentence(query), k1, b)
        [average_precision, precision_11point,
         recall_11point, p10] = cl.eval(scores, i)

        map_bm25 = map_bm25 + average_precision
        # Keep the per-query 11-point precision curves for the mean/std band
        precision_bm25.append(precision_11point)

        if verbose:
            plt.plot(recall_11point, precision_11point,
                     color='silver', alpha=0.1)
            print('qid =', i, 'BM25 AP=', average_precision)
        i = i + 1

    map_bm25 = map_bm25 / cl.num_queries

    # Mean 11-point precision-recall curve across queries, with a one-std band
    plt.plot(recall_11point, np.mean(precision_bm25, axis=0), color='b', alpha=1)
    plt.gca().set_aspect('equal', adjustable='box')
    plt.fill_between(
        recall_11point,
        np.mean(precision_bm25, axis=0) - np.std(precision_bm25, axis=0),
        np.mean(precision_bm25, axis=0) + np.std(precision_bm25, axis=0),
        facecolor='b', alpha=0.1)
    plt.xlabel('Recall')
    plt.ylabel('Precision')
    plt.ylim([0.0, 1.0])
    plt.xlim([0.0, 1.0])
    plt.title('Precision-Recall (MAP={0:0.2f})'.format(map_bm25))
    plt.savefig('results/bm25test.png', dpi=100)

    return [precision_bm25, recall_11point, map_bm25]
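# For reference, a minimal sketch of BM25 scoring, assuming score_bm25 follows
# the standard Okapi formula (the names and tf layout, documents x terms, are
# assumptions):
def bm25_score_sketch(tf, query_term_ids, k1=1.2, b=0.75):
    n_docs = tf.shape[0]
    df = (tf > 0).sum(axis=0)                               # document frequency
    idf = np.log((n_docs - df + 0.5) / (df + 0.5) + 1)      # BM25 idf
    doc_len = tf.sum(axis=1, keepdims=True)
    avgdl = doc_len.mean()
    # Term-frequency saturation (k1) and document-length normalisation (b)
    weights = idf * tf * (k1 + 1) / (tf + k1 * (1 - b + b * doc_len / avgdl))
    return weights[:, query_term_ids].sum(axis=1)           # score per document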
def __init__(self, bigrams):
    ### 1. Load the corpus
    cranfield = collectionloaders.CranfieldTestBed()

    ### 2. Parse the corpus
    # Tokenize, stem and remove stop words
    if not bigrams:
        vectorizer = CountVectorizer()
    else:
        vectorizer = CountVectorizer(ngram_range=(1, 2),
                                     token_pattern=r'\b\w+\b', min_df=1)
    corpus = parser.stemCorpus(cranfield.corpus_cranfield['abstract'])

    ### 3. Create the model
    # Compute the term-frequency matrix and the model statistics
    tf_cranfield = vectorizer.fit_transform(corpus).toarray()

    colors = ['green', 'red', 'blue']
    limits = [-3, -5, -10]
    # For each feedback-document limit, compute how the MAP varies
    # over a range of alpha values.
    for idx, k in enumerate(limits):
        self.map_val = []
        alphas = np.arange(0, 1, 0.1)
        for alpha in alphas:
            models = RetrievalModelsMatrix.RetrievalModelsMatrix(
                tf_cranfield, vectorizer, alpha)
            i = 1
            map_model = 0
            for query in cranfield.queries:
                # Parse the query and compute the document scores
                scores = models.scoreRM3(parser.stemSentence(query), k, alpha)

                # Do the evaluation
                [average_precision, precision, self.recall,
                 thresholds] = cranfield.eval(scores, i)

                # Show the words that were considered relevant in this query
                words = self.show_query_terms(vectorizer, models)
                print('\nalpha:', alpha, ', limit:', abs(k), '\n', words)

                map_model = map_model + average_precision
                i = i + 1
            self.map_val.append(map_model / cranfield.num_queries)

        # Plot the MAP variation over the alpha values for this limit
        plt.plot(alphas, self.map_val, color=colors[idx], alpha=1,
                 label='limit = ' + str(abs(k)))

    plt.legend(loc='upper left')
    plt.ylim([0.0, 0.5])
    plt.xlim([0.0, 1.0])
    plt.xlabel('Alpha')
    plt.ylabel('MAP')
    plt.title('MAP-Alpha')
    plt.savefig('results/map-alpha.png', dpi=100)
    plt.show()
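# A minimal sketch of the RM3 expansion step that scoreRM3 presumably builds on
# (illustrative names and layout; the real signature differs): a relevance
# model is estimated from the top |limit| first-pass documents and interpolated
# with the original query model using alpha.
def rm3_query_model_sketch(tf, first_pass_scores, query_model, alpha, limit):
    top = np.argsort(first_pass_scores)[::-1][:abs(limit)]  # feedback documents
    doc_len = tf[top].sum(axis=1, keepdims=True)
    rel_model = (tf[top] / np.maximum(doc_len, 1)).mean(axis=0)  # P(t|R)
    rel_model = rel_model / rel_model.sum()
    # RM3 interpolation: alpha keeps the original query, (1 - alpha) expands it
    return alpha * query_model + (1 - alpha) * rel_model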