Code Example #1
    def __init__(self, bigrams):

        ### 1. Load the corpus
        cranfield = collectionloaders.CranfieldTestBed()

        ### 2. Parse the corpus
        # Tokenize, stem and remove stop words
        if not bigrams:
            vectorizer = CountVectorizer()
        else:
            vectorizer = CountVectorizer(ngram_range=(1, 2),
                                         token_pattern=r'\b\w+\b',
                                         min_df=1)

        corpus = parser.stemCorpus(cranfield.corpus_cranfield['abstract'])

        ### 3. Create the model
        # Compute the term frequencies matrix and the model statistics
        tf_cranfield = vectorizer.fit_transform(corpus).toarray()

        self.map_val = []
        lambs = np.arange(0, 1, 0.1)

        # Compute the MAP for each value in lambs, then plot how the MAP
        # varies as a function of lambda
        for lamb in lambs:
            models = RetrievalModelsMatrix.RetrievalModelsMatrix(
                tf_cranfield, vectorizer, lamb)

            i = 1
            map_model = 0
            for query in cranfield.queries:
                # Parse the query and compute the document scores
                scores = models.score_lmjm(parser.stemSentence(query))

                # Do the evaluation
                [average_precision, precision, self.recall,
                 thresholds] = cranfield.eval(scores, i)

                map_model = map_model + average_precision
                i = i + 1

            self.map_val.append(map_model / cranfield.num_queries)

        plt.plot(lambs, self.map_val, color='b', alpha=1)
        plt.ylim([0.0, 0.5])
        plt.xlim([0.0, 1.0])
        plt.xlabel('Lambda')
        plt.ylabel('MAP')
        plt.title('MAP-Lambda')
        plt.savefig('results/map-lamb.png', dpi=100)
        plt.show()
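
RetrievalModelsMatrix, collectionloaders and parser are project modules that are not shown in these examples. For reference, here is a minimal sketch of the Jelinek-Mercer scoring that score_lmjm presumably implements: each document's maximum-likelihood language model is interpolated with the collection model. The function lmjm_scores and its argument names are illustrative, not the project's API.

import numpy as np

def lmjm_scores(tf, query_tf, lamb):
    """Jelinek-Mercer smoothed query log-likelihood (illustrative sketch).

    tf       -- term-frequency matrix, shape (num_docs, num_terms)
    query_tf -- query term-frequency vector, shape (num_terms,)
    lamb     -- weight of the document model vs. the collection model;
                assumed to satisfy 0 < lamb < 1 so probabilities stay positive
    """
    doc_lens = tf.sum(axis=1, keepdims=True)    # tokens per document
    p_doc = tf / np.maximum(doc_lens, 1)        # P_ml(t | d)
    p_coll = tf.sum(axis=0) / tf.sum()          # P_ml(t | C)
    p_mix = lamb * p_doc + (1 - lamb) * p_coll  # smoothed P(t | d)
    return np.log(p_mix) @ query_tf             # sum of query-term log-probs
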
Code Example #2
def vsm(vectorizer, cl, verbose):
    plt.clf()
    corpus = parser.stemCorpus(cl.corpus_cranfield['abstract'])
    tf_cranfield = vectorizer.fit_transform(corpus).toarray()
    models = RetrievalModelsMatrix.RetrievalModelsMatrix(
        tf_cranfield, vectorizer)
    i = 1
    map_vsm = 0
    p10aux = 0
    precision_vsm = []
    for query in cl.queries_cranfield['query']:
        scores = models.score_vsm(parser.stemSentence(query))
        [average_precision, precision_11point, recall_11point,
         p10] = cl.eval(scores, i)
        map_vsm = map_vsm + average_precision
        p10aux = p10aux + p10
        precision_vsm.append(precision_11point)  # keep each query's 11-point curve

        if verbose:
            plt.plot(recall_11point,
                     precision_11point,
                     color='silver',
                     alpha=0.1)
            print('qid =', i, 'VSM     AP=', average_precision)
        i = i + 1

    map_vsm = map_vsm / cl.num_queries
    p10aux = p10aux / cl.num_queries

    # Plot the mean 11-point precision curve with a +/- one-std band
    mean_precision = np.mean(precision_vsm, axis=0)
    std_precision = np.std(precision_vsm, axis=0)
    plt.plot(recall_11point, mean_precision, color='b', alpha=1)
    plt.gca().set_aspect('equal', adjustable='box')
    plt.fill_between(recall_11point,
                     mean_precision - std_precision,
                     mean_precision + std_precision,
                     facecolor='b',
                     alpha=0.1)

    plt.xlabel('Recall')
    plt.ylabel('Precision')
    plt.ylim([0.0, 1.0])
    plt.xlim([0.0, 1.0])
    plt.title('Precision-Recall (MAP={0:0.2f})'.format(map_vsm))
    plt.savefig('results/VSMResult.png', dpi=100)

    finalres = [map_vsm, p10aux]
    return finalres
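
score_vsm is likewise opaque here. Below is a minimal standalone sketch of classical vector-space scoring, TF-IDF weighting followed by cosine similarity, under the assumption that this is roughly what the method computes; all names are illustrative.

import numpy as np

def vsm_scores(tf, query_tf):
    """Cosine similarity of TF-IDF vectors (illustrative sketch)."""
    num_docs = tf.shape[0]
    df = (tf > 0).sum(axis=0)                   # document frequency per term
    idf = np.log(num_docs / np.maximum(df, 1))  # standard IDF weighting
    docs = tf * idf                             # TF-IDF document matrix
    query = query_tf * idf                      # weight the query identically
    norms = np.linalg.norm(docs, axis=1) * np.linalg.norm(query)
    return (docs @ query) / np.maximum(norms, 1e-12)  # guard zero norms
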
Code Example #3
def lmjm(vectorizer, cl, verbose, lmbd):
    plt.clf()
    corpus = parser.stemCorpus(cl.corpus_cranfield['abstract'])
    tf_cranfield = vectorizer.fit_transform(corpus).toarray()
    models = RetrievalModelsMatrix.RetrievalModelsMatrix(
        tf_cranfield, vectorizer)
    precision_lmjm = []
    p10aux = 0
    map_lmjm = 0
    j = 1
    for query in cl.queries_cranfield['query']:
        score = models.score_lmjm(parser.stemSentence(query), lmbd)
        [average_precision, precision_11point, recall_11point,
         p10] = cl.eval(score, j)
        map_lmjm = map_lmjm + average_precision
        p10aux = p10aux + p10
        precision_lmjm.append(precision_11point)  # keep each query's 11-point curve
        if verbose:
            plt.plot(recall_11point,
                     precision_11point,
                     color='silver',
                     alpha=0.1)
            print('qid =', j, 'LMJM     AP=', average_precision)
        j = j + 1
    map_lmjm = map_lmjm / cl.num_queries
    p10aux = p10aux / cl.num_queries

    # Plot the mean 11-point precision curve with a +/- one-std band
    mean_precision = np.mean(precision_lmjm, axis=0)
    std_precision = np.std(precision_lmjm, axis=0)
    plt.plot(recall_11point, mean_precision, color='b', alpha=1)
    plt.gca().set_aspect('equal', adjustable='box')
    plt.fill_between(recall_11point,
                     mean_precision - std_precision,
                     mean_precision + std_precision,
                     facecolor='b',
                     alpha=0.1)

    plt.xlabel('Recall')
    plt.ylabel('Precision')
    plt.ylim([0.0, 1.0])
    plt.xlim([0.0, 1.0])
    plt.title('Precision-Recall (MAP={0:0.2f})'.format(map_lmjm))
    plt.savefig('results/LMJMResult.png', dpi=100)

    finalres = [map_lmjm, p10aux]
    return finalres
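
Since vsm and lmjm share the (vectorizer, cl, verbose, ...) calling convention and both return [MAP, P10], one driver can sweep both models. A hypothetical usage sketch, assuming the project's collectionloaders module is importable and both helpers are in scope:

from sklearn.feature_extraction.text import CountVectorizer

import collectionloaders  # project module, assumed importable

cl = collectionloaders.CranfieldTestBed()
vectorizer = CountVectorizer()

map_vsm, p10_vsm = vsm(vectorizer, cl, verbose=False)
map_lmjm, p10_lmjm = lmjm(vectorizer, cl, verbose=False, lmbd=0.5)
print('VSM : MAP={0:0.3f} P10={1:0.3f}'.format(map_vsm, p10_vsm))
print('LMJM: MAP={0:0.3f} P10={1:0.3f}'.format(map_lmjm, p10_lmjm))
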
Code Example #4
    def __init__(self, bigrams):

        ### 1. Load the corpus
        cranfield = collectionloaders.CranfieldTestBed()

        ### 2. Parse the corpus
        # Tokenize, stem and remove stop words
        if not bigrams:
            vectorizer = CountVectorizer()
        else:
            vectorizer = CountVectorizer(ngram_range=(1, 2),
                                         token_pattern=r'\b\w+\b',
                                         min_df=1)

        corpus = parser.stemCorpus(cranfield.corpus_cranfield['abstract'])

        ### 3. Create the model
        # Compute the term frequencies matrix and the model statistics
        tf_cranfield = vectorizer.fit_transform(corpus).toarray()
        models = RetrievalModelsMatrix.RetrievalModelsMatrix(
            tf_cranfield, vectorizer, 0.5, 250)

        ### 4. Run the queries over the corpus
        i = 1
        self.p10_model = 0
        self.precision_model = []

        for query in cranfield.queries:
            # Parse the query and compute the document scores
            scores = models.score_lmd(parser.stemSentence(query))

            # Do the evaluation
            [average_precision, precision, self.recall,
             p10] = cranfield.eval(scores, i)

            # Sum the P10 values across queries
            self.p10_model = self.p10_model + p10
            self.precision_model.append(precision)

            i = i + 1

        # Compute the mean P10 and print it
        self.p10_model = self.p10_model / cranfield.num_queries
        print('\nP10 =', self.p10_model)
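
score_lmd is not shown either. The standard Dirichlet-smoothed language model it presumably implements is sketched below, with mu = 250 matching the constructor argument above; names are illustrative.

import numpy as np

def lmd_scores(tf, query_tf, mu=250):
    """Dirichlet-smoothed query log-likelihood (illustrative sketch):
    P(t | d) = (tf(t, d) + mu * P(t | C)) / (|d| + mu)
    """
    doc_lens = tf.sum(axis=1, keepdims=True)  # tokens per document
    p_coll = tf.sum(axis=0) / tf.sum()        # P_ml(t | C)
    p_dir = (tf + mu * p_coll) / (doc_lens + mu)
    return np.log(p_dir) @ query_tf           # sum of query-term log-probs
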
Code Example #5

def bm25(vectorizer, cl, verbose, k1, b):
    plt.clf()  # start from a clean figure, like the other helpers
    corpus = parser.stemCorpus(cl.corpus_cranfield['abstract'])
    tf_cranfield = vectorizer.fit_transform(corpus).toarray()
    models = RetrievalModelsMatrix.RetrievalModelsMatrix(
        tf_cranfield, vectorizer)
    i = 1
    map_bm25 = 0
    precision_bm25 = []    # per-query average precision (returned below)
    precision_curves = []  # per-query 11-point curves for the mean/std band
    for query in cl.queries_cranfield['query']:
        scores = models.score_bm25(parser.stemSentence(query), k1, b)
        [average_precision, precision_11point, recall_11point,
         p10] = cl.eval(scores, i)
        map_bm25 = map_bm25 + average_precision
        precision_bm25.append(average_precision)
        precision_curves.append(precision_11point)

        if verbose:
            plt.plot(recall_11point,
                     precision_11point,
                     color='silver',
                     alpha=0.1)
            print('qid =', i, 'BM25    AP=', average_precision)
        i = i + 1
    map_bm25 = map_bm25 / cl.num_queries

    # Plot the mean 11-point precision curve with a +/- one-std band
    mean_precision = np.mean(precision_curves, axis=0)
    std_precision = np.std(precision_curves, axis=0)
    plt.plot(recall_11point, mean_precision, color='b', alpha=1)
    plt.gca().set_aspect('equal', adjustable='box')
    plt.fill_between(recall_11point,
                     mean_precision - std_precision,
                     mean_precision + std_precision,
                     facecolor='b',
                     alpha=0.1)

    plt.xlabel('Recall')
    plt.ylabel('Precision')
    plt.ylim([0.0, 1.0])
    plt.xlim([0.0, 1.0])
    plt.title('Precision-Recall (MAP={0:0.2f})'.format(map_bm25))
    plt.savefig('results/bm25test.png', dpi=100)

    finalres = [precision_bm25, recall_11point, map_bm25]
    return finalres
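
For reference, a minimal sketch of the Okapi BM25 weighting that score_bm25(query, k1, b) presumably applies. This is the textbook formula with a non-negative (Lucene-style) IDF, not the project's actual implementation.

import numpy as np

def bm25_scores(tf, query_tf, k1=1.2, b=0.75):
    """Okapi BM25 (illustrative sketch)."""
    num_docs = tf.shape[0]
    doc_lens = tf.sum(axis=1, keepdims=True)  # tokens per document
    avg_len = doc_lens.mean()
    df = (tf > 0).sum(axis=0)                 # document frequency per term
    idf = np.log((num_docs - df + 0.5) / (df + 0.5) + 1)  # non-negative IDF
    # Saturated term frequency with document-length normalization
    tf_sat = tf * (k1 + 1) / (tf + k1 * (1 - b + b * doc_lens / avg_len))
    return (tf_sat * idf) @ (query_tf > 0).astype(float)
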
Code Example #6
    def __init__(self, bigrams):

        ### 1. Load the corpus
        cranfield = collectionloaders.CranfieldTestBed()

        ### 2. Parse the corpus
        # Tokenize, stem and remove stop words
        if not bigrams:
            vectorizer = CountVectorizer()
        else:
            vectorizer = CountVectorizer(ngram_range=(1, 2),
                                         token_pattern=r'\b\w+\b',
                                         min_df=1)

        corpus = parser.stemCorpus(cranfield.corpus_cranfield['abstract'])

        ### 3. Create the model
        # Compute the term frequencies matrix and the model statistics
        tf_cranfield = vectorizer.fit_transform(corpus).toarray()

        colors = ['green', 'red', 'blue']
        limits = [-3, -5, -10]

        # For each limit, compute how the MAP varies over a range of alpha values
        for idx, k in enumerate(limits):
            self.map_val = []
            alphas = np.arange(0, 1, 0.1)
            for alpha in alphas:
                models = RetrievalModelsMatrix.RetrievalModelsMatrix(
                    tf_cranfield, vectorizer, alpha)

                i = 1
                map_model = 0
                for query in cranfield.queries:
                    # Parse the query and compute the document scores
                    scores = models.scoreRM3(parser.stemSentence(query), k,
                                             alpha)

                    # Do the evaluation
                    [average_precision, precision, self.recall,
                     thresholds] = cranfield.eval(scores, i)

                    # Show the terms that were considered relevant for this query
                    words = self.show_query_terms(vectorizer, models)
                    print('\nalpha:', alpha, ', limit:', abs(k), '\n', words)

                    map_model = map_model + average_precision
                    i = i + 1

                self.map_val.append(map_model / cranfield.num_queries)

            # Plot the MAP variation across the different alpha values
            plt.plot(alphas,
                     self.map_val,
                     color=colors[idx],
                     alpha=1,
                     label='limit = ' + str(abs(k)))

        plt.legend(loc='upper left')
        plt.ylim([0.0, 0.5])
        plt.xlim([0.0, 1.0])
        plt.xlabel('Alpha')
        plt.ylabel('MAP')
        plt.title('MAP-Alpha')
        plt.savefig('results/map-alpha.png', dpi=100)
        plt.show()
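
scoreRM3's internals are not shown. The standard RM3 formulation that the alpha and limit parameters presumably control interpolates the original query model with a relevance model estimated from the top-k feedback documents (which side alpha weights varies between implementations):

P_{\mathrm{RM3}}(w \mid q) = (1 - \alpha)\, P(w \mid q) + \alpha \sum_{d \in D_k} P(w \mid d)\, P(d \mid q)

Under this reading, the limits -3, -5 and -10 above would correspond to feedback sets D_k of 3, 5 and 10 documents.
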
Code Example #7
    def __init__(self, model_type):

        # Names of the models the program can evaluate
        models_names = ["vsm", "lmd", "lmjm", "rm3"]
        cranfield = collectionloaders.CranfieldTestBed()

        corpus = parser.stemCorpus(cranfield.corpus_cranfield['abstract'])

        colors = ['b', 'r']
        labels = ['unigram', 'bigram']

        for k in range(0, 2):

            # Depending on k, the vectorizer uses unigrams only or unigrams plus bigrams
            if k == 0:
                vectorizer = CountVectorizer()
            else:
                vectorizer = CountVectorizer(ngram_range=(1, 2),
                                             token_pattern=r'\b\w+\b',
                                             min_df=1)

            tf_cranfield = vectorizer.fit_transform(corpus).toarray()
            models = RetrievalModelsMatrix.RetrievalModelsMatrix(
                tf_cranfield, vectorizer)

            i = 1
            self.map_model = 0
            self.precision_model = []

            # Run all the queries and compute the MAP and the mean precision
            for query in cranfield.queries:
                scores = self.compute_score(models, model_type, query)

                [average_precision, precision, self.recall,
                 thresholds] = cranfield.eval(scores, i)

                self.map_model = self.map_model + average_precision
                self.precision_model.append(precision)
                i = i + 1

            self.map_model = self.map_model / cranfield.num_queries
            mean_precision = np.mean(self.precision_model, axis=0)

            # Plot the precision-recall curve using the color and label
            # stored at position k in the colors and labels arrays
            plt.plot(self.recall,
                     mean_precision,
                     color=colors[k],
                     alpha=1,
                     label=labels[k])
            plt.gca().set_aspect('equal', adjustable='box')

        plt.xlabel('Recall')
        plt.ylabel('Precision')
        plt.ylim([0.0, 1.0])
        plt.xlim([0.0, 1.0])

        # Places the legend at the top left corner of the plot
        plt.legend(loc='upper left')
        plt.title('Precision-Recall (' + models_names[model_type].upper() +
                  ')')
        plt.savefig('results/uni-bi-' + models_names[model_type] + '.png',
                    dpi=100)
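
The only difference between the two passes above is the vectorizer configuration. A quick standalone illustration of what the bigram setting adds to the vocabulary, using scikit-learn's CountVectorizer exactly as configured above:

from sklearn.feature_extraction.text import CountVectorizer

docs = ['flow past a flat plate']  # toy document

uni = CountVectorizer()
bi = CountVectorizer(ngram_range=(1, 2), token_pattern=r'\b\w+\b', min_df=1)

print(sorted(uni.fit(docs).vocabulary_))
# ['flat', 'flow', 'past', 'plate']  ('a' survives only with the custom pattern)
print(sorted(bi.fit(docs).vocabulary_))
# ['a', 'a flat', 'flat', 'flat plate', 'flow', 'flow past', 'past', 'past a', 'plate']
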
Code Example #8
    def __init__(self, bigrams, model_type, is_sw=0.05, is_ba=0.95):

        ### 1. Load the corpus
        cranfield = collectionloaders.CranfieldTestBed()

        ### 2. Parse the corpus
        # Tokenize, stem and remove stop words
        if not bigrams:
            vectorizer = CountVectorizer()
        else:
            vectorizer = CountVectorizer(ngram_range=(1, 2),
                                         token_pattern=r'\b\w+\b',
                                         min_df=1)

        corpus = parser.stemCorpus(cranfield.corpus_cranfield['abstract'])

        ### 3. Create the model
        # Compute the term frequencies matrix and the model statistics
        tf_cranfield = vectorizer.fit_transform(corpus).toarray()
        models = RetrievalModelsMatrix.RetrievalModelsMatrix(
            tf_cranfield, vectorizer, 0.5, 250)

        ### 4. Run the queries over the corpus
        i = 1
        self.map_model = 0
        self.precision_model = []
        self.ap_below = 0
        self.better_query = []
        self.worse_query = []

        plt.figure(1)
        for query in cranfield.queries:
            # Parse the query and compute the document scores
            scores = self.compute_score(models, model_type, query)

            # Do the evaluation
            [average_precision, precision, self.recall,
             thresholds] = cranfield.eval(scores, i)

            # Track queries whose average precision falls below the
            # is_sw threshold (default 0.05)
            if average_precision < is_sw:
                self.ap_below = self.ap_below + 1
                self.worse_query.append(i)

            # Record queries whose average precision reaches the
            # is_ba threshold (default 0.95)
            if average_precision >= is_ba:
                self.better_query.append(i)

            # Sums all the average_precision values obtained in the different queries
            self.map_model = self.map_model + average_precision
            self.precision_model.append(precision)
            plt.plot(self.recall, precision, color='silver', alpha=0.1)

            i = i + 1

        # Compute the MAP and the percentage of queries whose average
        # precision fell below the is_sw threshold
        self.map_model = self.map_model / cranfield.num_queries
        self.ap_below = (self.ap_below / cranfield.num_queries) * 100

        print('model ', model_type, ' done.')
        print('MAP = ', self.map_model)
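
cranfield.eval is project-specific, but the MAP values printed throughout these examples aggregate the textbook average precision. A minimal standalone sketch of that computation (illustrative, not the project's eval):

import numpy as np

def average_precision(scores, relevant):
    """AP = mean of precision@k taken at the ranks k of the relevant docs.
    Illustrative sketch; not the project's cranfield.eval."""
    ranking = np.argsort(scores)[::-1]      # document ids, best score first
    hits, precisions = 0, []
    for rank, doc_id in enumerate(ranking, start=1):
        if doc_id in relevant:
            hits += 1
            precisions.append(hits / rank)  # precision at this rank
    return float(np.mean(precisions)) if precisions else 0.0

# Toy check: relevant docs land at ranks 1 and 3 -> AP = (1/1 + 2/3) / 2
print(average_precision(np.array([0.9, 0.5, 0.2]), relevant={0, 2}))  # ~0.833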