def setup():
    """Populate the module-level globals ``small_vec`` and ``large_vec``.

    ``small_vec`` is a 1-D sparse labeled tensor holding ``-i`` at each
    index ``i`` in ``0..4``; ``large_vec`` is a 1-D sparse labeled tensor
    holding ``i - 50`` at each index ``i`` in ``0..99``.
    """
    global small_vec, large_vec

    small_vec = make_sparse_labeled_tensor(ndim=1)
    for index in range(5):
        small_vec[index] = -index

    large_vec = make_sparse_labeled_tensor(ndim=1)
    # Values run from -50 up to 49, stored at indices 0 through 99.
    for index, value in enumerate(range(-50, 50)):
        large_vec[index] = value
def test_detuple_too_big():
    """Call ``extreme_items(detuple=True)`` on a 2-D tensor with 30 entries.

    The tensor is seeded with the diagonal items ``((i, i), i)`` for
    ``i`` in ``0..29``. The return value is discarded.

    NOTE(review): the expected outcome (for example an exception checked by
    a decorator or by the test runner) is not visible here -- confirm.
    """
    diagonal_items = (((n, n), n) for n in range(30))
    tensor = make_sparse_labeled_tensor(ndim=2, initial=diagonal_items)
    tensor.extreme_items(detuple=True)
"""Build a sparse matrix from a list of words, then analyze the associations.

Each non-stopword is counted in a 2-D sparse labeled tensor, the tensor is
tf-idf normalized, and an SVD of the normalized tensor is summarized.
"""
from csc.divisi import make_sparse_labeled_tensor
from csc.nl import get_nl

en_nl = get_nl('en')

# 2-D matrix indexed by (normalized word, original word).
matrix = make_sparse_labeled_tensor(ndim=2)

words = [
    'go', 'fly', 'a', 'kite', 'dogma',
    'karma', 'fang', 'poodle', 'aussie', 'corgie',
    'one', 'two', 'three', 'four', 'five',
    'six', 'seven', 'eight', 'nine', 'ten',
]

for w in words:
    # Stopwords contribute nothing to the matrix.
    if en_nl.is_stopword(w):
        continue
    normalized = en_nl.normalize(w)
    matrix[normalized, w] += 1

# Apply a term frequency-inverse document frequency (tf-idf) normalization
# to the counts. When the material comes from documents, this keeps larger
# documents from dominating the analysis by factoring in the length of each
# document as well as the frequency with which words are used within them.
mat_normalized = matrix.normalized('tfidf')

svd = mat_normalized.svd(k=10)
svd.summarize()
def test_detuple_too_big():
    """Exercise ``extreme_items(detuple=True)`` on a 30-entry 2-D tensor.

    The tensor is built from the items ``((i, i), i)`` for ``i`` in
    ``0..29``; whatever ``extreme_items`` returns is ignored.

    NOTE(review): what this call is expected to do (succeed or raise) is
    not shown in this block -- verify against the test harness.
    """
    tensor = make_sparse_labeled_tensor(
        ndim=2,
        initial=(((k, k), k) for k in range(30))
    )
    tensor.extreme_items(detuple=True)