def build(self, wf=None, normalization=True):
    '''
    Build the inverted index and wrap it in a DatabaseWrapper.

    As a side effect the freshly built word index is stored on
    ``self.word_index``.

    @param wf: optional workflow object used for debugging; when truthy it
               receives ``word_index``, ``df_vec`` and ``wieghts_mat``
               attributes describing the intermediate results
    @param normalization: when true, ``normalize`` is called on the weight
                          table before it is handed to the wrapper
    @returns: DatabaseWrapper
    '''
    _log.info("Start building inverted index")
    _log.info("Normalization={}".format(normalization))

    _log.info("Building word index")
    # Unique enumeration of words: a word's index is its position in the list.
    self.word_index = build_word_index(self.concepts_list)
    terms_message = "Number of terms={}".format(len(self.word_index))
    _log.info(terms_message)
    concepts_message = "Number of concepts={}".format(len(self.concepts_list))
    _log.info(concepts_message)

    # Reverse lookup: word => index in word_index.
    word_positions = build_index_by_words(self.word_index)

    # Documents per word.
    doc_freq = build_df(word_positions, self.concepts_list)
    _log.info("DF vector build is DONE")

    # Weight table, not normalized yet.
    weights = build_wieght_table_dok(doc_freq, word_positions, self.concepts_list)
    _log.info("ID-TDF vector build is DONE")

    if normalization:
        # The return value is ignored, so normalize() presumably works in
        # place on the table -- TODO confirm against its definition.
        normalize(weights)
        _log.info("Normalization is DONE")

    wrapper = DatabaseWrapper(
        weights,
        self.concepts_list,
        self.word_index,
        self.stemmer,
    )
    _log.info("Database wrapper created")

    if not wf:
        return wrapper

    # Expose the intermediate results on the debug workflow object.
    wf.word_index = self.word_index
    # Workaround (per the original author) to force the returned wf to be sparse.
    wf.df_vec = matrix(doc_freq)
    wf.wieghts_mat = weights
    return wrapper
def test_build_index_by_wordsempty(self):
    """Check that build_index_by_words returns an empty dict for an empty list."""
    result = dbb.build_index_by_words([])
    self.assertEqual({}, result)
def test_build_index_by_words_not_empty(self):
    """Check that every word is mapped to its position in the input list."""
    # NOTE(review): another test with this exact name is defined right after
    # this one; if both live in the same class only the later definition is
    # collected -- confirm and rename or drop one of them.
    words = ['a', 'd', 'b']
    positions = {
        'a': 0,
        'd': 1,
        'b': 2,
    }
    result = dbb.build_index_by_words(words)
    self.assertEqual(positions, result)
def test_build_index_by_words_not_empty(self):
    """Check that a three-word list yields a word -> list-position dict."""
    # NOTE(review): a test with this exact name is also defined just before
    # this one; if both live in the same class this definition replaces the
    # earlier one -- confirm and rename or drop one of them.
    self.assertEqual(
        dict(a=0, d=1, b=2),
        dbb.build_index_by_words(['a', 'd', 'b']),
    )