def build_weight(self):
    """Reset the weight matrices and assign a weight per index pair.

    Allocates fresh square ``self.weight`` and ``self.weight_sum``
    matrices sized by ``V.shape[0]``, then walks every entry of
    ``self.F`` and calls ``self.assign_weight`` on consecutive items of
    that entry (wrapping around modulo 3).
    """
    # NOTE(review): ``V`` is read as a free name rather than an attribute
    # of ``self``; presumably a module-level array -- confirm.
    size, width = V.shape[0], V.shape[1]
    self.weight = matrix((size, size))
    self.weight_sum = matrix((size, size))

    for entry in self.F:
        for pos in range(width):
            # NOTE(review): the wrap-around uses a literal 3 while the loop
            # bound is V.shape[1]; the two only agree when V has 3 columns.
            self.assign_weight(entry[pos], entry[(pos + 1) % 3])
def test__centroid(self):
    """Check get_vectors_centroid against a hand-computed mean."""
    # Component-wise mean of the three inputs is (9/3, 0, 5/3).
    coordinates = ([0, 0, 8], [6, 0, 2], [3, 0, -5])
    vectors = [matrix(row) for row in coordinates]
    expected_vec = matrix([3, 0, 5.0 / 3])

    actual_vec = math_utils.get_vectors_centroid(vectors)

    numpy.testing.assert_array_almost_equal(
        expected_vec.todense(),
        actual_vec.todense(),
        err_msg="Centroid wrong calculations!",
    )
def test__centroid(self):
    """Verify that the centroid of three vectors is their mean."""
    inputs = []
    for components in ([0, 0, 8], [6, 0, 2], [3, 0, -5]):
        inputs.append(matrix(components))

    # Sums per component are 9, 0 and 5; dividing by 3 gives the target.
    wanted = matrix([3, 0, 5.0 / 3])
    got = math_utils.get_vectors_centroid(inputs)

    check = numpy.testing.assert_array_almost_equal
    check(wanted.todense(), got.todense(),
          err_msg="Centroid wrong calculations!")
def simple_wf():
    """Build a WorkFlow fixture filled in by hand for three tiny documents.

    Every intermediate product (indexes, count matrix, tf matrix, df/idf
    vectors and the final weight matrix) is stored on the returned
    object so tests can compare against it.

    @returns: the populated WorkFlow
    """
    wf = WorkFlow()

    # Documents under test.
    wf.docs = [
        Doc("Testing advanced 1", "a b c c c d d d d e"),
        Doc("Testing advanced 2", "a a a a a b c c c d d d e e"),
        Doc("Testing advanced 3", "b b b b f f f f"),
    ]

    # Indexes: titles in document order, and the vocabulary.
    wf.title_index = [document.title for document in wf.docs]
    wf.word_index = list("abcdef")

    # Raw counts: one row per word (a..f), one column per document.
    counts_per_word = [
        [1, 5, 0],  # a
        [1, 1, 4],  # b
        [3, 3, 0],  # c
        [4, 3, 0],  # d
        [1, 2, 0],  # e
        [0, 0, 4],  # f
    ]
    wf.count_matrix = array(counts_per_word)

    # Term frequencies: apply math_utils.count_to_tf element-wise.
    wf.tf_mat = vectorize(math_utils.count_to_tf)(wf.count_matrix)

    # Log of the inverted document frequency, one value per word.
    wf.n_docs = float(len(wf.docs))
    wf.df_vec = array([2.0, 3.0, 2.0, 2.0, 2.0, 1.0])
    wf.idf_vec = wf.n_docs / wf.df_vec
    wf.log_idf_vec = log(wf.idf_vec)

    # Scale each word row of the tf matrix by that word's log-idf.
    log_idf_column = wf.log_idf_vec[:, None]
    wf.wieghts_mat = matrix(wf.tf_mat * log_idf_column)
    return wf
def generate_Stream_GM():
    """Create a random boolean sparse matrix of shape (Stream, GM).

    For each row, 20 column indices are drawn with
    ``np.random.randint`` from ``[0, GM)`` and the matching cells are
    set. Draws may repeat, so a row ends up with at most 20 set cells.
    """
    links = matrix((Stream, GM), dtype=np.bool_)
    for row in tqdm(range(Stream)):
        for col in np.random.randint(low=0, high=GM, size=20):
            links[row, col] = 1
    return links
def simple_wf():
    """Return a WorkFlow whose fields are computed by hand for three docs.

    The fixture holds the documents, both indexes, the word-by-document
    count matrix, its tf transform, the df/idf vectors and the resulting
    weight matrix.

    @returns: the populated WorkFlow
    """
    wf = WorkFlow()

    # Corpus: three short documents over the vocabulary a..f.
    corpus = (
        ("Testing advanced 1", "a b c c c d d d d e"),
        ("Testing advanced 2", "a a a a a b c c c d d d e e"),
        ("Testing advanced 3", "b b b b f f f f"),
    )
    wf.docs = [Doc(title, text) for title, text in corpus]

    # Pre-processing results.
    wf.title_index = [entry.title for entry in wf.docs]
    wf.word_index = ['a', 'b', 'c', 'd', 'e', 'f']

    # Occurrence counts, rows follow word_index, columns follow docs.
    wf.count_matrix = array([
        [1, 5, 0],  # a
        [1, 1, 4],  # b
        [3, 3, 0],  # c
        [4, 3, 0],  # d
        [1, 2, 0],  # e
        [0, 0, 4],  # f
    ])
    to_tf = vectorize(math_utils.count_to_tf)
    wf.tf_mat = to_tf(wf.count_matrix)

    # Inverted document frequency and its logarithm.
    wf.n_docs = float(len(wf.docs))
    wf.df_vec = array([2.0, 3.0, 2.0, 2.0, 2.0, 1.0])
    wf.idf_vec = wf.n_docs / wf.df_vec
    wf.log_idf_vec = log(wf.idf_vec)

    # Broadcast the per-word log-idf across the document columns.
    weighted = wf.tf_mat * wf.log_idf_vec[:, None]
    wf.wieghts_mat = matrix(weighted)
    return wf
def getSimpleDb(self):
    """Create a small DatabaseWrapper fixture.

    The weight matrix has one row per word ('a', 'b', 'c') and one
    column per concept ('c1', 'c2'); the stemmer is built with an empty
    list.
    """
    weights = matrix([
        [0.5, 0.5],
        [0.2, 0.8],
        [1.0, 0.0],
    ])
    return DatabaseWrapper(
        weights,
        ['c1', 'c2'],
        ['a', 'b', 'c'],
        StopWordsStemmer([]),
    )
def generate_WM_Stream():
    """Create a random boolean sparse matrix of shape (WM, Stream).

    Each row gets ``average_stream_per_vox`` column indices drawn with
    ``np.random.randint`` from ``[0, Stream)``; repeated draws land on
    the same cell, so a row may hold fewer set cells than draws.
    """
    # Cell-by-cell assignment below is specific to a DOK matrix.
    links = matrix((WM, Stream), dtype=np.bool_)
    for row in tqdm(range(WM)):
        picked = np.random.randint(
            low=0, high=Stream, size=average_stream_per_vox)
        for col in picked:
            links[row, col] = 1
    return links
def test_simple(self):
    """Check the text centroid of "a b c" on the getSimpleDb fixture."""
    # Arrange: fixture plus the expected two-component centroid.
    db = self.getSimpleDb()
    wanted = matrix([1.7 / 3, 1.3 / 3])

    # Act.
    got = db.get_text_centroid("a b c")

    # Assert: compare the dense forms up to floating-point tolerance.
    numpy.testing.assert_array_almost_equal(
        wanted.todense(),
        got.todense(),
        err_msg="wrong centroid",
    )
def get_vectors_centroid(list_of_vectors):
    """Return the centroid (element-wise mean) of equally shaped vectors.

    The vectors are added onto an empty matrix with the shape of the
    first one, and the sum is scaled by ``1.0 / len(list_of_vectors)``.

    @param list_of_vectors: list of vectors that all share one shape
    @returns: the centroid, or None when the list is empty
    """
    count = len(list_of_vectors)
    if not count:
        return None

    # Start from an empty matrix shaped like the first vector, so the
    # accumulation works the same way for any number of inputs.
    zero = matrix(list_of_vectors[0].shape)
    total = sum(list_of_vectors, zero)
    return total * (1.0 / count)
def test_migration(self):
    """Pin the simple_wf() weight matrix to previously recorded values."""
    recorded_rows = [
        [0.40546511, 1.05803603, 0.],
        [0., 0., 0.],
        [0.85091406, 0.85091406, 0.],
        [0.9675591, 0.85091406, 0.],
        [0.40546511, 0.6865121, 0.],
        [0., 0., 2.62161231],
    ]
    expected = matrix(recorded_rows)

    actual = simple_wf().wieghts_mat

    assert_allclose(actual.todense(), expected.todense())
def get_word_vector(self, word):
    """
    Row representation of the word in Wiki concepts

    @param word: the word to look up in ``self.index_by_word``
    @returns: the matching row of ``self.wieght_matrix``; for a word
        that is not in the index, an empty matrix of shape
        (1, ``self.concepts_num``)
    """
    # ``in`` replaces dict.has_key(), which was removed in Python 3;
    # the membership test behaves the same on Python 2 and 3.
    if word in self.index_by_word:
        index = self.index_by_word[word]
        return self.wieght_matrix[index, :]

    # Word is not in the corpus: return an empty vector.
    return matrix((1, self.concepts_num))
def get_word_vector(self, word):
    """
    Row representation of the word in Wiki concepts

    @param word: the word to look up in ``self.index_by_word``
    @returns: the text vector in wikipedia space: the matching row of
        ``self.wieght_matrix``, or an empty matrix of shape
        (1, ``self.concepts_num``) when the word is unknown
    """
    # dict.has_key() no longer exists in Python 3; the ``in`` operator
    # is the portable membership test and works on Python 2 as well.
    if word not in self.index_by_word:
        # Word is not in the corpus: return an empty vector.
        return matrix((1, self.concepts_num))

    row = self.index_by_word[word]
    return self.wieght_matrix[row, :]
def test_migration(self):
    """Compare simple_wf() weights with the values recorded earlier."""
    expected = matrix([
        [0.40546511, 1.05803603, 0.0],
        [0.0, 0.0, 0.0],
        [0.85091406, 0.85091406, 0.0],
        [0.9675591, 0.85091406, 0.0],
        [0.40546511, 0.6865121, 0.0],
        [0.0, 0.0, 2.62161231],
    ])

    workflow = simple_wf()
    produced = workflow.wieghts_mat

    assert_allclose(produced.todense(), expected.todense())
def test__advanced_doc(self):
    """Check DbBuilder output against the hand-built simple_wf() fixture.

    The documents of the fixture are fed to a DbBuilder, the build runs
    without normalization, and the resulting df vector and weight matrix
    are compared with the fixture's values.
    """
    reference = simple_wf()

    builder = DbBuilder(StopWordsStemmer([]))
    for document in reference.docs:
        builder.add_document(document)

    produced = WorkFlow()
    builder.build(wf=produced, normalization=False)

    # Workaround for a dimensions mismatch: wrap the reference df vector
    # in a matrix so both sides offer todense().
    reference.df_vec = matrix(reference.df_vec)

    assert_allclose(produced.df_vec.todense(),
                    reference.df_vec.todense())
    assert_allclose(produced.wieghts_mat.todense(),
                    reference.wieghts_mat.todense())
def build(self, wf=None, normalization=True):
    '''
    Runs the build pipeline and wraps the result in a DatabaseWrapper

    @param wf: optional workflow object for debug purposes; when truthy
        it receives the word index, the df vector and the weight table
    @param normalization: when true, normalize() is applied to the
        weight table before it is wrapped
    @returns: DatabaseWrapper
    '''
    _log.info("Start building inverted index")
    _log.info("Normalization={}".format(normalization))

    # Unique enumeration of words: a word's position in the list is its
    # index.
    _log.info("Building word index")
    self.word_index = build_word_index(self.concepts_list)
    _log.info("Number of terms={}".format(len(self.word_index)))
    _log.info("Number of concepts={}".format(len(self.concepts_list)))

    # Reverse lookup: word => position in word_index.
    word_to_index = build_index_by_words(self.word_index)

    # Document frequency per word.
    doc_freq = build_df(word_to_index, self.concepts_list)
    _log.info("DF vector build is DONE")

    # Weight table, not yet normalized.
    weights = build_wieght_table_dok(
        doc_freq, word_to_index, self.concepts_list)
    _log.info("ID-TDF vector build is DONE")

    if normalization:
        # The return value is discarded, so normalize() presumably
        # updates the table in place -- confirm.
        normalize(weights)
        _log.info("Normalization is DONE")

    wrapper = DatabaseWrapper(
        weights, self.concepts_list, self.word_index, self.stemmer)
    _log.info("Database wrapper created")

    if not wf:
        return wrapper

    # Expose intermediate products on the workflow object.
    wf.word_index = self.word_index
    # Workaround to force the returned df vector to be sparse.
    wf.df_vec = matrix(doc_freq)
    wf.wieghts_mat = weights
    return wrapper