コード例 #1
0
    def test_bulid_word_index(self):
        # Three concepts with overlapping word lists: 'b' and 'c' repeat
        # across concepts, 'a' and 'x' each occur once.
        first = Concept(0, 'title0', ['a', 'b', 'c'])
        second = Concept(1, 'title1', ['b', 'c'])
        third = Concept(2, 'title2', ['x', 'c'])

        word_index = dbb.build_word_index([first, second, third])

        # Compared as sets: the position of each word inside the returned
        # index is deliberately not part of this check.
        self.assertEqual({'a', 'b', 'c', 'x'}, set(word_index))
コード例 #2
0
 def test_bulid_word_index(self):
     # Overlapping word lists: 'b' and 'c' are shared between concepts,
     # while 'a' and 'x' are unique to one concept each.
     concept_a = Concept(0, 'title0', ['a', 'b', 'c'])
     concept_b = Concept(1, 'title1', ['b', 'c'])
     concept_c = Concept(2, 'title2', ['x', 'c'])

     built_index = dbb.build_word_index([concept_a, concept_b, concept_c])

     # Order of the words in the index is not verified, only membership.
     self.assertEqual({'a', 'b', 'c', 'x'}, set(built_index))
コード例 #3
0
ファイル: db_builder.py プロジェクト: gzvulon/WikiRep
 def build(self, wf=None, normalization=True):
     ''' Builds the inverted index and wraps it in a DatabaseWrapper.

         Pipeline, in order: word index -> word-to-position lookup ->
         document-frequency vector -> weight table -> optional
         normalization -> DatabaseWrapper. The word index is also stored
         on self.word_index.

         @param wf: optional workflow object for debug purposes; when truthy
                    it receives word_index, df_vec and wieghts_mat
         @param normalization: when truthy, normalize() is applied to the
                    weight table before it is wrapped
         @returns: DatabaseWrapper
     '''
     _log.info("Start building inverted index")
     _log.info("Normalization={}".format(normalization))

     _log.info("Building word index")
     # Enumeration of words: a word's position in this list is its index.
     self.word_index = build_word_index(self.concepts_list)
     term_count = len(self.word_index)
     _log.info("Number of terms={}".format(term_count))
     concept_count = len(self.concepts_list)
     _log.info("Number of concepts={}".format(concept_count))

     # Reverse lookup: word => its position in word_index.
     position_of_word = build_index_by_words(self.word_index)

     # Document-frequency data (docs per word).
     doc_freq = build_df(position_of_word, self.concepts_list)
     _log.info("DF vector build is DONE")

     # Weight table, not normalized at this point.
     weights = build_wieght_table_dok(doc_freq, position_of_word,
                                      self.concepts_list)
     _log.info("ID-TDF vector build is DONE")

     if normalization:
         # The return value is ignored, so normalize() is expected to
         # update the table in place -- TODO confirm against normalize().
         normalize(weights)
         _log.info("Normalization is DONE")

     wrapper = DatabaseWrapper(weights, self.concepts_list, self.word_index,
                               self.stemmer)
     _log.info("Database wrapper created")

     if not wf:
         return wrapper

     # Expose the intermediate results on the debug workflow object.
     wf.word_index = self.word_index
     # Workaround carried over from the original code: wrapping in matrix()
     # is meant to force the df vector stored on wf to be sparse.
     wf.df_vec = matrix(doc_freq)
     wf.wieghts_mat = weights
     return wrapper
コード例 #4
0
ファイル: db_builder.py プロジェクト: roeiba/WikiRep
    def build(self, wf=None, normalization=True):
        ''' Builds the inverted index and returns it as a DatabaseWrapper.

            Steps run in this order: word index, word-to-position lookup,
            document-frequency vector, weight table, optional normalization,
            DatabaseWrapper construction. self.word_index is set along
            the way.

            @param wf: optional workflow object for debug purposes; if truthy,
                       word_index, df_vec and wieghts_mat are stored on it
            @param normalization: if truthy, normalize() is called on the
                       weight table before wrapping
            @returns: DatabaseWrapper
        '''
        _log.info("Start building inverted index")
        _log.info("Normalization={}".format(normalization))

        _log.info("Building word index")
        # List of words; the position of a word in the list is its index.
        self.word_index = build_word_index(self.concepts_list)
        n_terms = len(self.word_index)
        _log.info("Number of terms={}".format(n_terms))
        n_concepts = len(self.concepts_list)
        _log.info("Number of concepts={}".format(n_concepts))

        # Lookup from a word to its position in word_index.
        word_to_pos = build_index_by_words(self.word_index)

        # Docs per word.
        df = build_df(word_to_pos, self.concepts_list)
        _log.info("DF vector build is DONE")

        # Weight table before any normalization.
        weight_table = build_wieght_table_dok(df, word_to_pos,
                                              self.concepts_list)
        _log.info("ID-TDF vector build is DONE")

        if normalization:
            # Result of normalize() is discarded; presumably it modifies
            # the table in place -- TODO confirm against normalize().
            normalize(weight_table)
            _log.info("Normalization is DONE")

        database = DatabaseWrapper(weight_table, self.concepts_list,
                                   self.word_index, self.stemmer)
        _log.info("Database wrapper created")

        if not wf:
            return database

        # Hand the intermediate artifacts to the debug workflow object.
        wf.word_index = self.word_index
        # Workaround kept from the original code: matrix() is intended to
        # force the df vector stored on wf to be sparse.
        wf.df_vec = matrix(df)
        wf.wieghts_mat = weight_table
        return database
コード例 #5
0
ファイル: db_builder.py プロジェクト: gzvulon/WikiRep
 def get_word_index(self):
     ''' Returns the word index.

         If self.word_index is set (anything other than None, including
         an empty list) it is returned as is. Otherwise a fresh index is
         built from self.concepts_list; the result is NOT stored back on
         self.word_index.
     '''
     existing = self.word_index
     if existing is not None:
         return existing
     return build_word_index(self.concepts_list)
コード例 #6
0
    def test_build_word_index_empty(self):
        # With no concepts there are no words to index, so the result
        # must be exactly an empty list.
        self.assertEqual([], dbb.build_word_index([]))
コード例 #7
0
 def test_build_word_index_empty(self):
     # An empty concepts list must produce an empty word index.
     no_concepts = []
     self.assertEqual([], dbb.build_word_index(no_concepts))
コード例 #8
0
ファイル: db_builder.py プロジェクト: roeiba/WikiRep
 def get_word_index(self):
     ''' Returns self.word_index when it is not None; otherwise builds an
         index from self.concepts_list and returns it without storing it.
     '''
     current = self.word_index
     if current is not None:
         return current
     return build_word_index(self.concepts_list)