def __init__(self, bigrams):
    ### 1. Load the corpus
    cranfield = collectionloaders.CranfieldTestBed()

    ### 2. Parse the corpus
    # Tokenize, stem and remove stop words
    if not bigrams:
        vectorizer = CountVectorizer()
    else:
        vectorizer = CountVectorizer(ngram_range=(1, 2), token_pattern=r'\b\w+\b', min_df=1)
    corpus = parser.stemCorpus(cranfield.corpus_cranfield['abstract'])

    ### 3. Create the model
    # Compute the term frequencies matrix and the model statistics
    tf_cranfield = vectorizer.fit_transform(corpus).toarray()

    index = 0
    self.map_val = []
    lambs = np.arange(0, 1, 0.1)

    # For each value in the lambs array, compute the MAP over all queries.
    # After all lambda values have been evaluated, plot how the MAP varies
    # with lambda.
    for lamb in lambs:
        models = RetrievalModelsMatrix.RetrievalModelsMatrix(tf_cranfield, vectorizer, lamb)
        i = 1
        map_model = 0
        for query in cranfield.queries:
            # Parse the query and compute the document scores
            scores = models.score_lmjm(parser.stemSentence(query))

            # Do the evaluation
            [average_precision, precision, self.recall, thresholds] = cranfield.eval(scores, i)
            map_model = map_model + average_precision
            i = i + 1

        self.map_val.append(map_model / cranfield.num_queries)
        index = index + 1

    plt.plot(lambs, self.map_val, color='b', alpha=1)
    plt.ylim([0.0, 0.5])
    plt.xlim([0.0, 1.0])
    plt.xlabel('Lambda')
    plt.ylabel('MAP')
    plt.title('MAP-Lambda')
    plt.savefig('results/map-lamb.png', dpi=100)
    plt.show()
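# ---------------------------------------------------------------------------
# Illustrative sketch (not part of RetrievalModelsMatrix, whose implementation
# is not shown here): score_lmjm above presumably ranks documents with
# Jelinek-Mercer smoothed query likelihood. A minimal version over a
# documents-by-terms count matrix could look like this; the matrix layout and
# the helper name are assumptions.
import numpy as np

def jelinek_mercer_scores(tf_matrix, query_term_ids, lamb=0.5):
    """score(q, d) = sum_t log( lamb * P(t|d) + (1 - lamb) * P(t|C) )"""
    doc_lens = tf_matrix.sum(axis=1, keepdims=True)         # |d| for every document
    coll_freq = tf_matrix.sum(axis=0)                        # collection frequency cf(t)
    p_t_d = tf_matrix / np.maximum(doc_lens, 1)              # P(t|d), guarding empty documents
    p_t_c = coll_freq / coll_freq.sum()                      # P(t|C)
    smoothed = lamb * p_t_d + (1 - lamb) * p_t_c             # mixture per (document, term)
    return np.log(smoothed[:, query_term_ids]).sum(axis=1)   # one log-likelihood per document

# query_term_ids would come from the vectorizer vocabulary, e.g.
# [vectorizer.vocabulary_[t] for t in parser.stemSentence(query).split() if t in vectorizer.vocabulary_]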
def vsm(vectorizer, cl, verbose):
    plt.clf()
    corpus = parser.stemCorpus(cl.corpus_cranfield['abstract'])
    tf_cranfield = vectorizer.fit_transform(corpus).toarray()
    models = RetrievalModelsMatrix.RetrievalModelsMatrix(tf_cranfield, vectorizer)

    i = 1
    map_vsm = 0
    p10aux = 0
    precision_vsm = []
    recallarr = []
    for query in cl.queries_cranfield['query']:
        # Score the documents for this query and evaluate the ranking
        scores = models.score_vsm(parser.stemSentence(query))
        [average_precision, precision_11point, recall_11point, p10] = cl.eval(scores, i)

        map_vsm = map_vsm + average_precision
        p10aux = p10aux + p10
        precision_vsm.append(average_precision)
        recallarr.append(recall_11point)

        if verbose:
            plt.plot(recall_11point, precision_11point, color='silver', alpha=0.1)
            print('qid =', i, 'VSM AP=', average_precision)
        i = i + 1

    # Average MAP and P@10 over all queries
    map_vsm = map_vsm / cl.num_queries
    p10aux = p10aux / cl.num_queries

    plt.plot(recall_11point, precision_11point, color='b', alpha=1)
    plt.gca().set_aspect('equal', adjustable='box')
    plt.fill_between(recall_11point,
                     np.mean(precision_vsm, axis=0) - np.std(precision_vsm, axis=0),
                     np.mean(precision_vsm, axis=0) + np.std(precision_vsm, axis=0),
                     facecolor='b', alpha=0.1)
    plt.xlabel('Recall')
    plt.ylabel('Precision')
    plt.ylim([0.0, 1.0])
    plt.xlim([0.0, 1.0])
    plt.title('Precision-Recall (MAP={0:0.2f})'.format(map_vsm))
    plt.savefig('results/VSMResult.png', dpi=100)

    finalres = [map_vsm, p10aux]
    return finalres
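# ---------------------------------------------------------------------------
# Illustrative sketch (not part of RetrievalModelsMatrix): score_vsm above is
# presumably a cosine-similarity vector space model. A minimal version over a
# raw count matrix is shown below; whether the original class applies tf-idf
# weighting first is an assumption left open here.
import numpy as np

def cosine_vsm_scores(tf_matrix, query_vector):
    """Cosine similarity between the query vector and every document row."""
    doc_norms = np.linalg.norm(tf_matrix, axis=1)
    query_norm = np.linalg.norm(query_vector)
    denom = np.maximum(doc_norms * query_norm, 1e-12)   # avoid division by zero
    return (tf_matrix @ query_vector) / denom

# The query vector can be built with the same vectorizer used for the corpus:
# query_vector = vectorizer.transform([parser.stemSentence(query)]).toarray()[0]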
def lmjm(vectorizer, cl, verbose, lmbd):
    plt.clf()
    corpus = parser.stemCorpus(cl.corpus_cranfield['abstract'])
    tf_cranfield = vectorizer.fit_transform(corpus).toarray()
    models = RetrievalModelsMatrix.RetrievalModelsMatrix(tf_cranfield, vectorizer)

    scores_array = []
    p10aux = 0
    map_lmjm = 0
    j = 1
    for query in cl.queries_cranfield['query']:
        # Score the documents with Jelinek-Mercer smoothing and evaluate the ranking
        score = models.score_lmjm(parser.stemSentence(query), lmbd)
        [average_precision, precision_11point, recall_11point, p10] = cl.eval(score, j)

        map_lmjm = map_lmjm + average_precision
        p10aux = p10aux + p10
        scores_array.append(average_precision)

        if verbose:
            plt.plot(recall_11point, precision_11point, color='silver', alpha=0.1)
            print('qid =', j, 'LMJM AP=', average_precision)
        j = j + 1

    # Average MAP and P@10 over all queries
    map_lmjm = map_lmjm / cl.num_queries
    p10aux = p10aux / cl.num_queries

    plt.plot(recall_11point, precision_11point, color='b', alpha=1)
    plt.gca().set_aspect('equal', adjustable='box')
    plt.fill_between(recall_11point,
                     np.mean(scores_array, axis=0) - np.std(scores_array, axis=0),
                     np.mean(scores_array, axis=0) + np.std(scores_array, axis=0),
                     facecolor='b', alpha=0.1)
    plt.xlabel('Recall')
    plt.ylabel('Precision')
    plt.ylim([0.0, 1.0])
    plt.xlim([0.0, 1.0])
    plt.title('Precision-Recall (MAP={0:0.2f})'.format(map_lmjm))
    plt.savefig('results/LMJMResult.png', dpi=100)

    finalres = [map_lmjm, p10aux]
    return finalres
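# ---------------------------------------------------------------------------
# Example of how the driver functions above might be invoked; the lambda value
# and the unigram vectorizer are illustrative choices, not taken from the
# original code.
if __name__ == '__main__':
    cl = collectionloaders.CranfieldTestBed()
    vectorizer = CountVectorizer()
    map_vsm, p10_vsm = vsm(vectorizer, cl, verbose=False)
    map_lmjm, p10_lmjm = lmjm(vectorizer, cl, verbose=False, lmbd=0.5)
    print('VSM  MAP={0:0.4f} P@10={1:0.4f}'.format(map_vsm, p10_vsm))
    print('LMJM MAP={0:0.4f} P@10={1:0.4f}'.format(map_lmjm, p10_lmjm))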
def __init__(self, bigrams):
    ### 1. Load the corpus
    cranfield = collectionloaders.CranfieldTestBed()

    ### 2. Parse the corpus
    # Tokenize, stem and remove stop words
    if not bigrams:
        vectorizer = CountVectorizer()
    else:
        vectorizer = CountVectorizer(ngram_range=(1, 2), token_pattern=r'\b\w+\b', min_df=1)
    corpus = parser.stemCorpus(cranfield.corpus_cranfield['abstract'])

    ### 3. Create the model
    # Compute the term frequencies matrix and the model statistics
    tf_cranfield = vectorizer.fit_transform(corpus).toarray()
    models = RetrievalModelsMatrix.RetrievalModelsMatrix(tf_cranfield, vectorizer, 0.5, 250)

    ### 4. Run the queries over the corpus
    i = 1
    self.p10_model = 0
    self.precision_model = []
    for query in cranfield.queries:
        # Parse the query and compute the document scores
        scores = models.score_lmd(parser.stemSentence(query))

        # Do the evaluation
        [average_precision, precision, self.recall, p10] = cranfield.eval(scores, i)

        # Sum the P@10 values obtained for the different queries
        self.p10_model = self.p10_model + p10
        self.precision_model.append(precision)
        i = i + 1

    # Compute the mean P@10 over all queries and print it
    self.p10_model = self.p10_model / cranfield.num_queries
    print('\nP10 =', self.p10_model)
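# ---------------------------------------------------------------------------
# Illustrative sketch (not part of RetrievalModelsMatrix): score_lmd above is
# presumably Dirichlet-smoothed query likelihood, and the 250 passed to the
# constructor is assumed to be the Dirichlet prior mu. A minimal version:
import numpy as np

def dirichlet_scores(tf_matrix, query_term_ids, mu=250):
    """score(q, d) = sum_t log( (tf(t, d) + mu * P(t|C)) / (|d| + mu) )"""
    doc_lens = tf_matrix.sum(axis=1, keepdims=True)   # |d| for every document
    coll_freq = tf_matrix.sum(axis=0)                  # collection frequency cf(t)
    p_t_c = coll_freq / coll_freq.sum()                # P(t|C)
    smoothed = (tf_matrix + mu * p_t_c) / (doc_lens + mu)
    return np.log(smoothed[:, query_term_ids]).sum(axis=1)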
def bm25(vectorizer, cl, verbose, k1, b):
    corpus = parser.stemCorpus(cl.corpus_cranfield['abstract'])
    tf_cranfield = vectorizer.fit_transform(corpus).toarray()
    models = RetrievalModelsMatrix.RetrievalModelsMatrix(tf_cranfield, vectorizer)

    i = 1
    map_bm25 = 0
    precision_bm25 = []
    for query in cl.queries_cranfield['query']:
        # Score the documents with BM25 (parameters k1 and b) and evaluate the ranking
        scores = models.score_bm25(parser.stemSentence(query), k1, b)
        [average_precision, precision_11point, recall_11point, p10] = cl.eval(scores, i)

        map_bm25 = map_bm25 + average_precision
        precision_bm25.append(average_precision)

        if verbose:
            plt.plot(recall_11point, precision_11point, color='silver', alpha=0.1)
            print('qid =', i, 'BM25 AP=', average_precision)
        i = i + 1

    # Average MAP over all queries
    map_bm25 = map_bm25 / cl.num_queries

    plt.plot(recall_11point, precision_11point, color='b', alpha=1)
    plt.gca().set_aspect('equal', adjustable='box')
    plt.fill_between(recall_11point,
                     np.mean(precision_bm25, axis=0) - np.std(precision_bm25, axis=0),
                     np.mean(precision_bm25, axis=0) + np.std(precision_bm25, axis=0),
                     facecolor='b', alpha=0.1)
    plt.xlabel('Recall')
    plt.ylabel('Precision')
    plt.ylim([0.0, 1.0])
    plt.xlim([0.0, 1.0])
    plt.title('Precision-Recall (MAP={0:0.2f})'.format(map_bm25))
    plt.savefig('results/bm25test.png', dpi=100)

    finalres = [precision_bm25, recall_11point, map_bm25]
    return finalres
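# ---------------------------------------------------------------------------
# Illustrative sketch (not part of RetrievalModelsMatrix): score_bm25 above is
# presumably the Okapi BM25 ranking function with parameters k1 and b. The idf
# variant used by the original class is an assumption.
import numpy as np

def bm25_scores(tf_matrix, query_term_ids, k1=1.2, b=0.75):
    """score(q, d) = sum_t idf(t) * tf * (k1 + 1) / (tf + k1 * (1 - b + b * |d| / avgdl))"""
    n_docs = tf_matrix.shape[0]
    doc_lens = tf_matrix.sum(axis=1, keepdims=True)
    avgdl = doc_lens.mean()
    df = (tf_matrix > 0).sum(axis=0)                          # document frequency per term
    idf = np.log((n_docs - df + 0.5) / (df + 0.5) + 1.0)      # non-negative idf variant
    tf = tf_matrix[:, query_term_ids]
    norm = k1 * (1 - b + b * doc_lens / avgdl)                # document length normalisation
    return (idf[query_term_ids] * tf * (k1 + 1) / (tf + norm)).sum(axis=1)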
def __init__(self, bigrams):
    ### 1. Load the corpus
    cranfield = collectionloaders.CranfieldTestBed()

    ### 2. Parse the corpus
    # Tokenize, stem and remove stop words
    if not bigrams:
        vectorizer = CountVectorizer()
    else:
        vectorizer = CountVectorizer(ngram_range=(1, 2), token_pattern=r'\b\w+\b', min_df=1)
    corpus = parser.stemCorpus(cranfield.corpus_cranfield['abstract'])

    ### 3. Create the model
    # Compute the term frequencies matrix and the model statistics
    tf_cranfield = vectorizer.fit_transform(corpus).toarray()

    colors = ['green', 'red', 'blue']
    limits = [-3, -5, -10]

    # For each defined limit, compute the MAP variation over a range of alpha values
    for k in limits:
        index = 0
        self.map_val = []
        alphas = np.arange(0, 1, 0.1)
        for alpha in alphas:
            models = RetrievalModelsMatrix.RetrievalModelsMatrix(tf_cranfield, vectorizer, alpha)
            i = 1
            map_model = 0
            for query in cranfield.queries:
                # Parse the query and compute the document scores
                scores = models.scoreRM3(parser.stemSentence(query), k, alpha)

                # Do the evaluation
                [average_precision, precision, self.recall, thresholds] = cranfield.eval(scores, i)

                # Compute the words that were considered relevant in this query
                words = self.show_query_terms(vectorizer, models)
                print('\nalpha:', alpha, ', limit:', abs(k), '\n', words)

                map_model = map_model + average_precision
                i = i + 1

            self.map_val.append(map_model / cranfield.num_queries)
            index = index + 1

        # Plot the MAP variation over the different alpha values for this limit
        plt.plot(alphas, self.map_val, color=colors[limits.index(k)], alpha=1,
                 label='limit = ' + str(abs(k)))

    plt.legend(loc='upper left')
    plt.ylim([0.0, 0.5])
    plt.xlim([0.0, 1.0])
    plt.xlabel('Alpha')
    plt.ylabel('MAP')
    plt.title('MAP-Alpha')
    plt.savefig('results/map-alpha.png', dpi=100)
    plt.show()
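# ---------------------------------------------------------------------------
# Illustrative sketch (not part of RetrievalModelsMatrix): scoreRM3 above is
# presumably RM3 pseudo-relevance feedback, i.e. a relevance model estimated
# from the top-ranked documents of an initial run and interpolated with the
# original query model through alpha. The negative limits in the code suggest
# the feedback set is taken from the tail of a sorted score list; the helper
# below only shows the expanded query model, and how the original class turns
# it back into document scores is an assumption.
import numpy as np

def rm3_query_model(tf_matrix, initial_scores, query_term_ids, n_feedback_docs=10, alpha=0.5):
    """P'(t|q) = alpha * P(t|q) + (1 - alpha) * P_RM1(t)"""
    initial_scores = np.asarray(initial_scores)
    top_docs = np.argsort(initial_scores)[-n_feedback_docs:]            # highest-scoring documents
    fb_tf = tf_matrix[top_docs]
    p_t_d = fb_tf / np.maximum(fb_tf.sum(axis=1, keepdims=True), 1)     # P(t|d) in the feedback set
    weights = np.exp(initial_scores[top_docs] - initial_scores[top_docs].max())  # from log-scores
    rm1 = (weights[:, None] * p_t_d).sum(axis=0)
    rm1 = rm1 / rm1.sum()                                               # P_RM1(t)
    original = np.zeros(tf_matrix.shape[1])
    original[query_term_ids] = 1.0 / len(query_term_ids)                # uniform P(t|q)
    return alpha * original + (1 - alpha) * rm1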
def __init__(self, model_type):
    # Names of the models the program can compute
    models_names = ["vsm", "lmd", "lmjm", "rm3"]

    cranfield = collectionloaders.CranfieldTestBed()
    corpus = parser.stemCorpus(cranfield.corpus_cranfield['abstract'])

    colors = ['b', 'r']
    labels = ['unigram', 'bigram']
    for k in range(0, 2):
        # Depending on k, the vectorizer uses unigrams only or unigrams plus bigrams
        if k == 0:
            vectorizer = CountVectorizer()
        else:
            vectorizer = CountVectorizer(ngram_range=(1, 2), token_pattern=r'\b\w+\b', min_df=1)

        tf_cranfield = vectorizer.fit_transform(corpus).toarray()
        models = RetrievalModelsMatrix.RetrievalModelsMatrix(tf_cranfield, vectorizer)

        i = 1
        self.map_model = 0
        self.precision_model = []
        # Go through all the queries and compute the MAP and the mean precision
        for query in cranfield.queries:
            scores = self.compute_score(models, model_type, query)
            [average_precision, precision, self.recall, thresholds] = cranfield.eval(scores, i)
            self.map_model = self.map_model + average_precision
            self.precision_model.append(precision)
            i = i + 1

        self.map_model = self.map_model / cranfield.num_queries
        mean_precision = np.mean(self.precision_model, axis=0)

        # Draw the Precision-Recall curve using the color at position k of the "colors" array,
        # labelled with the entry at position k of the "labels" array
        plt.plot(self.recall, mean_precision, color=colors[k], alpha=1, label=labels[k])

    plt.gca().set_aspect('equal', adjustable='box')
    plt.xlabel('Recall')
    plt.ylabel('Precision')
    plt.ylim([0.0, 1.0])
    plt.xlim([0.0, 1.0])
    # Place the legend at the top left corner of the plot
    plt.legend(loc='upper left')
    plt.title('Precision-Recall (' + models_names[model_type].upper() + ')')
    plt.savefig('results/uni-bi-' + models_names[model_type] + '.png', dpi=100)
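# ---------------------------------------------------------------------------
# compute_score is not shown in this excerpt; given models_names above, it
# presumably dispatches on the model_type index. A module-level sketch (the
# RM3 arguments and the exact scoring-method signatures are assumptions):
def compute_score_sketch(models, model_type, query):
    stemmed = parser.stemSentence(query)
    if model_type == 0:      # "vsm"
        return models.score_vsm(stemmed)
    elif model_type == 1:    # "lmd"
        return models.score_lmd(stemmed)
    elif model_type == 2:    # "lmjm"
        return models.score_lmjm(stemmed)
    else:                    # "rm3"
        return models.scoreRM3(stemmed, -5, 0.5)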
def __init__(self, bigrams, model_type, is_sw=0.05, is_ba=0.95):
    ### 1. Load the corpus
    cranfield = collectionloaders.CranfieldTestBed()

    ### 2. Parse the corpus
    # Tokenize, stem and remove stop words
    if not bigrams:
        vectorizer = CountVectorizer()
    else:
        vectorizer = CountVectorizer(ngram_range=(1, 2), token_pattern=r'\b\w+\b', min_df=1)
    corpus = parser.stemCorpus(cranfield.corpus_cranfield['abstract'])

    ### 3. Create the model
    # Compute the term frequencies matrix and the model statistics
    tf_cranfield = vectorizer.fit_transform(corpus).toarray()
    models = RetrievalModelsMatrix.RetrievalModelsMatrix(tf_cranfield, vectorizer, 0.5, 250)

    ### 4. Run the queries over the corpus
    i = 1
    self.map_model = 0
    self.precision_model = []
    self.ap_below = 0
    self.better_query = []
    self.worse_query = []
    plt.figure(1)
    for query in cranfield.queries:
        # Parse the query and compute the document scores
        scores = self.compute_score(models, model_type, query)

        # Do the evaluation
        [average_precision, precision, self.recall, thresholds] = cranfield.eval(scores, i)

        # Flag queries whose average precision falls below the is_sw threshold (default 0.05)
        # and accumulate their average precision
        if is_sw > average_precision:
            # print('qid =', i, ' AP=', average_precision)
            self.ap_below = self.ap_below + average_precision
            self.worse_query.append(i)

        # Flag queries whose average precision reaches the is_ba threshold (default 0.95)
        if average_precision >= is_ba:
            self.better_query.append(i)

        # Sum the average_precision values obtained for the different queries
        self.map_model = self.map_model + average_precision
        self.precision_model.append(precision)
        plt.plot(self.recall, precision, color='silver', alpha=0.1)
        i = i + 1

    # Compute the MAP; ap_below is normalised by the number of queries
    # and expressed as a percentage
    self.map_model = self.map_model / cranfield.num_queries
    self.ap_below = (self.ap_below / cranfield.num_queries) * 100
    print('model ', model_type, ' done.')
    print('MAP = ', self.map_model)
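# ---------------------------------------------------------------------------
# Illustrative helper (not in the original class): once __init__ has filled
# worse_query, better_query and map_model, the per-query analysis could be
# summarised like this; the attribute names mirror the ones set above.
def summarise_query_analysis(analysis, low_threshold=0.05, high_threshold=0.95):
    """Print which query ids fell below / reached the AP thresholds."""
    print('queries with AP below {0}: {1}'.format(low_threshold, analysis.worse_query))
    print('queries with AP of at least {0}: {1}'.format(high_threshold, analysis.better_query))
    print('MAP over all queries: {0:0.4f}'.format(analysis.map_model))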