Example #1
    def __init__(self, bigrams):

        ### 1. Load the corpus
        cranfield = collectionloaders.CranfieldTestBed()

        ### 2. Parse the corpus
        # Tokenize, stem and remove stop words
        if not bigrams:
            vectorizer = CountVectorizer()
        else:
            vectorizer = CountVectorizer(ngram_range=(1, 2),
                                         token_pattern=r'\b\w+\b',
                                         min_df=1)

        corpus = parser.stemCorpus(cranfield.corpus_cranfield['abstract'])

        ### 3. Create the model
        # Compute the term frequencies matrix and the model statistics
        tf_cranfield = vectorizer.fit_transform(corpus).toarray()

        index = 0
        self.map_val = []
        lambs = np.arange(0, 1, 0.1)

        # Compute the MAP for each value in the lambs array and then
        # plot how the MAP varies with the lambda value
        for lamb in lambs:
            models = RetrievalModelsMatrix.RetrievalModelsMatrix(
                tf_cranfield, vectorizer, lamb)

            i = 1
            map_model = 0
            for query in cranfield.queries:
                # Parse the query and compute the document scores
                scores = models.score_lmjm(parser.stemSentence(query))

                # Do the evaluation
                [average_precision, precision, self.recall,
                 thresholds] = cranfield.eval(scores, i)

                map_model = map_model + average_precision
                i = i + 1

            self.map_val.append(map_model / cranfield.num_queries)
            index = index + 1

        plt.plot(lambs, self.map_val, color='b', alpha=1)
        plt.ylim([0.0, 0.5])
        plt.xlim([0.0, 1.0])
        plt.xlabel('Lambda')
        plt.ylabel('MAP')
        plt.title('MAP-Lambda')
        plt.savefig('results/map-lamb.png', dpi=100)
        plt.show()
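
Note: score_lmjm is defined in RetrievalModelsMatrix, which is not shown on this page. For reference, below is a minimal sketch of the Jelinek-Mercer smoothed language-model score such a method is commonly assumed to compute, with the lambda swept above interpolating between the document and collection language models. The function name and arguments are illustrative assumptions, not the actual implementation.

import numpy as np

def score_lmjm_sketch(tf_matrix, query_term_ids, lamb):
    # tf_matrix      : (num_docs, num_terms) term-frequency matrix (e.g. tf_cranfield)
    # query_term_ids : column indices of the query terms in the vectorizer vocabulary
    # lamb           : Jelinek-Mercer interpolation weight
    doc_lens = tf_matrix.sum(axis=1, keepdims=True)        # |d|
    p_term_doc = tf_matrix / np.maximum(doc_lens, 1)       # P(t|d), maximum likelihood
    p_term_col = tf_matrix.sum(axis=0) / tf_matrix.sum()   # P(t|C), collection model

    # log P(q|d) under JM smoothing: sum_t log(lamb * P(t|d) + (1 - lamb) * P(t|C))
    smoothed = lamb * p_term_doc + (1 - lamb) * p_term_col
    return np.log(np.maximum(smoothed[:, query_term_ids], 1e-12)).sum(axis=1)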
Example #2
    def __init__(self, bigrams):

        ### 1. Load the corpus
        cranfield = collectionloaders.CranfieldTestBed()

        ### 2. Parse the corpus
        # Tokenize, stem and remove stop words
        if not bigrams:
            vectorizer = CountVectorizer()
        else:
            vectorizer = CountVectorizer(ngram_range=(1, 2),
                                         token_pattern=r'\b\w+\b',
                                         min_df=1)

        corpus = parser.stemCorpus(cranfield.corpus_cranfield['abstract'])

        ### 3. Create the model
        # Compute the term frequencies matrix and the model statistics
        tf_cranfield = vectorizer.fit_transform(corpus).toarray()
        models = RetrievalModelsMatrix.RetrievalModelsMatrix(
            tf_cranfield, vectorizer, 0.5, 250)

        ### 4. Run the queries over the corpus
        i = 1
        self.p10_model = 0
        self.precision_model = []

        for query in cranfield.queries:
            # Parse the query and compute the document scores
            scores = models.score_lmd(parser.stemSentence(query))

            # Do the evaluation
            [average_precision, precision, self.recall,
             p10] = cranfield.eval(scores, i)

            # Sum the P@10 values obtained over the different queries
            self.p10_model = self.p10_model + p10
            self.precision_model.append(precision)

            i = i + 1

        # Compute the mean P@10 over all queries and print it
        self.p10_model = self.p10_model / cranfield.num_queries
        print('\nP10 =', self.p10_model)
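
Note: score_lmd and the fourth constructor argument (250 above) are not shown in this excerpt; a common reading is that 250 is the Dirichlet prior mu of a Dirichlet-smoothed language model. The sketch below only illustrates that assumed formula; the real implementation lives in RetrievalModelsMatrix.

import numpy as np

def score_lmd_sketch(tf_matrix, query_term_ids, mu=250):
    # Dirichlet smoothing: P(t|d) = (tf(t,d) + mu * P(t|C)) / (|d| + mu)
    doc_lens = tf_matrix.sum(axis=1, keepdims=True)        # |d|
    p_term_col = tf_matrix.sum(axis=0) / tf_matrix.sum()   # P(t|C)
    smoothed = (tf_matrix + mu * p_term_col) / (doc_lens + mu)
    return np.log(np.maximum(smoothed[:, query_term_ids], 1e-12)).sum(axis=1)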
Example #3
    def __init__(self, bigrams):

        ### 1. Load the corpus
        cranfield = collectionloaders.CranfieldTestBed()

        ### 2. Parse the corpus
        # Tokenize, stem and remove stop words
        if not bigrams:
            vectorizer = CountVectorizer()
        else:
            vectorizer = CountVectorizer(ngram_range=(1, 2),
                                         token_pattern=r'\b\w+\b',
                                         min_df=1)

        corpus = parser.stemCorpus(cranfield.corpus_cranfield['abstract'])

        ### 3. Create the model
        # Compute the term frequencies matrix and the model statistics
        tf_cranfield = vectorizer.fit_transform(corpus).toarray()

        colors = ['green', 'red', 'blue']
        limits = [-3, -5, -10]

        # For each limit, compute how the MAP varies
        # over a range of alpha values
        for k in limits:
            index = 0
            self.map_val = []
            alphas = np.arange(0, 1, 0.1)
            for alpha in alphas:
                models = RetrievalModelsMatrix.RetrievalModelsMatrix(
                    tf_cranfield, vectorizer, alpha)

                i = 1
                map_model = 0
                for query in cranfield.queries:
                    # Parse the query and compute the document scores
                    scores = models.scoreRM3(parser.stemSentence(query), k,
                                             alpha)

                    # Do the evaluation
                    [average_precision, precision, self.recall,
                     thresholds] = cranfield.eval(scores, i)

                    # Show the terms that were considered relevant for this query
                    words = self.show_query_terms(vectorizer, models)
                    print('\nalpha:', alpha, ', limit:', abs(k), '\n', words)

                    map_model = map_model + average_precision
                    i = i + 1

                self.map_val.append(map_model / cranfield.num_queries)
                index = index + 1

            # Plot the MAP variation over the alpha values for this limit
            plt.plot(alphas,
                     self.map_val,
                     color=colors[limits.index(k)],
                     alpha=1,
                     label='limit = ' + str(abs(k)))

        plt.legend(loc='upper left')
        plt.ylim([0.0, 0.5])
        plt.xlim([0.0, 1.0])
        plt.xlabel('Alpha')
        plt.ylabel('MAP')
        plt.title('MAP-Alpha')
        plt.savefig('results/map-alpha.png', dpi=100)
        plt.show()
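
Note: scoreRM3(query, limit, alpha) is not shown on this page. The sketch below illustrates the RM3-style pseudo-relevance feedback it is assumed to perform: take the top-|limit| documents from a first-pass ranking, build a relevance model from them, and interpolate it with the original query model using alpha. All names and the number of expansion terms are illustrative assumptions.

import numpy as np

def rm3_query_model_sketch(tf_matrix, query_vector, first_pass_scores,
                           limit, alpha, num_expansion_terms=10):
    # 1. Top-|limit| feedback documents from the first-pass ranking
    top_docs = np.argsort(first_pass_scores)[::-1][:abs(limit)]

    # 2. RM1 relevance model: mix of the feedback documents' language models,
    #    weighted by their (exponentiated) first-pass scores
    doc_lms = tf_matrix[top_docs] / np.maximum(
        tf_matrix[top_docs].sum(axis=1, keepdims=True), 1)
    weights = np.exp(first_pass_scores[top_docs])
    weights = weights / weights.sum()
    rm1 = (weights[:, None] * doc_lms).sum(axis=0)

    # Keep only the strongest expansion terms
    keep = np.argsort(rm1)[::-1][:num_expansion_terms]
    rm1_truncated = np.zeros_like(rm1)
    rm1_truncated[keep] = rm1[keep]
    rm1_truncated = rm1_truncated / max(rm1_truncated.sum(), 1e-12)

    # 3. RM3: interpolate the original query model with the relevance model
    q_model = query_vector / max(query_vector.sum(), 1)
    return alpha * q_model + (1 - alpha) * rm1_truncated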
Example #4
    def __init__(self, model_type):

        # Names of the models the program can compute
        models_names = ["vsm", "lmd", "lmjm", "rm3"]
        cranfield = collectionloaders.CranfieldTestBed()

        corpus = parser.stemCorpus(cranfield.corpus_cranfield['abstract'])

        colors = ['b', 'r']
        labels = ['unigram', 'bigram']

        for k in range(0, 2):

            # Depending on k, the vectorizer will use unigrams only or unigrams plus bigrams
            if k == 0:
                vectorizer = CountVectorizer()
            else:
                vectorizer = CountVectorizer(ngram_range=(1, 2),
                                             token_pattern=r'\b\w+\b',
                                             min_df=1)

            tf_cranfield = vectorizer.fit_transform(corpus).toarray()
            models = RetrievalModelsMatrix.RetrievalModelsMatrix(
                tf_cranfield, vectorizer)

            i = 1
            self.map_model = 0
            self.precision_model = []

            # Go through all the queries and compute the MAP and the mean precision
            for query in cranfield.queries:
                scores = self.compute_score(models, model_type, query)

                [average_precision, precision, self.recall,
                 thresholds] = cranfield.eval(scores, i)

                self.map_model = self.map_model + average_precision
                self.precision_model.append(precision)
                i = i + 1

            self.map_model = self.map_model / cranfield.num_queries
            mean_precision = np.mean(self.precision_model, axis=0)

            # Plot the precision-recall curve using the color at position k of the
            # "colors" array and give the line the label at position k of "labels"
            plt.plot(self.recall,
                     mean_precision,
                     color=colors[k],
                     alpha=1,
                     label=labels[k])
            plt.gca().set_aspect('equal', adjustable='box')

        plt.xlabel('Recall')
        plt.ylabel('Precision')
        plt.ylim([0.0, 1.0])
        plt.xlim([0.0, 1.0])

        # Places the legend at the top left corner of the plot
        plt.legend(loc='upper left')
        plt.title('Precision-Recall (' + models_names[model_type].upper() +
                  ')')
        plt.savefig('results/uni-bi-' + models_names[model_type] + '.png',
                    dpi=100)
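
Note: compute_score is not shown in this excerpt. A minimal sketch of how such a dispatcher could map model_type onto the scoring methods used in the other examples is given below; score_lmjm, score_lmd and scoreRM3 appear elsewhere on this page, while score_vsm and the RM3 parameters are assumptions.

def compute_score_sketch(models, model_type, query):
    # model_type indexes models_names = ["vsm", "lmd", "lmjm", "rm3"]
    stemmed_query = parser.stemSentence(query)
    if model_type == 0:
        return models.score_vsm(stemmed_query)           # assumed method name
    elif model_type == 1:
        return models.score_lmd(stemmed_query)
    elif model_type == 2:
        return models.score_lmjm(stemmed_query)
    else:
        return models.scoreRM3(stemmed_query, -5, 0.5)    # illustrative limit/alpha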
Example #5
    def __init__(self, bigrams, model_type, is_sw=0.05, is_ba=0.95):

        ### 1. Load the corpus
        cranfield = collectionloaders.CranfieldTestBed()

        ### 2. Parse the corpus
        # Tokenize, stem and remove stop words
        if not bigrams:
            vectorizer = CountVectorizer()
        else:
            vectorizer = CountVectorizer(ngram_range=(1, 2),
                                         token_pattern=r'\b\w+\b',
                                         min_df=1)

        corpus = parser.stemCorpus(cranfield.corpus_cranfield['abstract'])

        ### 3. Create the model
        # Compute the term frequencies matrix and the model statistics
        tf_cranfield = vectorizer.fit_transform(corpus).toarray()
        models = RetrievalModelsMatrix.RetrievalModelsMatrix(
            tf_cranfield, vectorizer, 0.5, 250)

        ### 4. Run the queries over the corpus
        i = 1
        self.map_model = 0
        self.precision_model = []
        self.ap_below = 0
        self.better_query = []
        self.worse_query = []

        plt.figure(1)
        for query in cranfield.queries:
            # Parse the query and compute the document scores
            scores = self.compute_score(models, model_type, query)

            # Do the evaluation
            [average_precision, precision, self.recall,
             thresholds] = cranfield.eval(scores, i)

            # If the query's average precision is below the is_sw threshold (0.05 by default),
            # record it as one of the worse queries
            if is_sw > average_precision:
                #print('qid =', i, ' AP=', average_precision)
                self.ap_below = self.ap_below + average_precision
                self.worse_query.append(i)

            # Record queries whose average precision reaches the is_ba threshold (0.95 by default)
            if average_precision >= is_ba:
                self.better_query.append(i)

            # Sums all the average_precision values obtained in the different queries
            self.map_model = self.map_model + average_precision
            self.precision_model.append(precision)
            plt.plot(self.recall, precision, color='silver', alpha=0.1)

            i = i + 1

        # Computes the mean value of MAP and
        # the percentage of queries that have an average precision below a static value
        self.map_model = self.map_model / cranfield.num_queries
        self.ap_below = (self.ap_below / cranfield.num_queries) * 100

        print('model', model_type, 'done.')
        print('MAP =', self.map_model)

        # Shade the area under the mean precision-recall curve on top of the
        # per-query curves plotted above
        mean_precision = np.mean(self.precision_model, axis=0)
        plt.fill_between(self.recall,
                         mean_precision,
                         facecolor='b',
                         alpha=0.1)

        plt.xlabel('Recall')
        plt.ylabel('Precision')
        plt.ylim([0.0, 1.0])
        plt.xlim([0.0, 1.0])
        plt.title('Precision-Recall (MAP={0:0.2f})'.format(self.map_model))
        plt.savefig('results/vsmtest.png', dpi=100)
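
Note: cranfield.eval is not shown on this page. For reference, the sketch below shows how a per-query average precision (the quantity averaged into the MAP above) is typically computed from a ranked list and a set of relevant documents; it only illustrates that standard metric, not the actual evaluation code.

import numpy as np

def average_precision_sketch(ranked_doc_ids, relevant_doc_ids):
    relevant = set(relevant_doc_ids)
    hits, precisions = 0, []
    for rank, doc_id in enumerate(ranked_doc_ids, start=1):
        if doc_id in relevant:
            hits += 1
            precisions.append(hits / rank)   # precision at each relevant hit
    return float(np.mean(precisions)) if precisions else 0.0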


##############################    Main Code    ##############################
cl = collectionloaders.CranfieldTestBed()
user_input = input("Command?")
verbose = True

while user_input.lower() != "q":
    if user_input.lower() == "mm":
        user_model_option = input("Model? (e.g. vsm/lmjm/lmd/bm25) ")
        user_input = input("Number of n-grams? (e.g. 1, 2 or 3) ")
        result = inputParser(user_input)
        if result == 0:
            # user_input holds the n-gram count entered above
            vectorizer = CountVectorizer(ngram_range=(1, int(user_input) + 1),
                                         token_pattern=r'\b\w+\b',
                                         min_df=1,
                                         stop_words='english')
        else:
            vsmArray = []