def nnmf_analysis(): if len(sys.argv) == 1: target_year = 1997 else: target_year = int(sys.argv[1]) fname = str(target_year) + '-' + str(target_year+1) + '.xls' X, flabels, PLY, ptr1, ptr2, ptr3 = parse_stats(fname) # pos = {C, F, G} ptr1 = np.array(X[:,0]==1).transpose() ptr2 = np.array(X[:,0]==2).transpose() ptr3 = np.array(X[:,0]==3).transpose() X = normalize(X) # NNMF w, h = nnmf.factorize(X[:,1:], pc=3, iter=100) print np.shape(h) print np.shape(w) print print h[:,0] print print h[:,1] print print h[:,2] print print h[:,3] return
def recommend(article, n, nofeatures, max_iter=50): #Factorize the matrix, make features word_dictionary, article_words, paper_titles = makeFeatures() wordmatrix,wordvec = makematrix(word_dictionary, article_words) plt.plot(sorted(word_dictionary.values())) plt.savefig("WordFrequencies.png") w,h = nnmf.factorize(np.matrix(wordmatrix), nofeatures, max_iter) print "Recommendations for article: " print article print "-----------------------------" tofind = unicode(paper_titles.index(article)) nearest = nn.findnearest(w[tofind,:], w, paper_titles, 1, 0) nn.printnearest(nearest, 5)
def visualizeFactorization(nofeatures=5, max_iter=10): word_dictionary, article_words, paper_titles = makeFeatures() wordmatrix,wordvec = makematrix(word_dictionary, article_words) w,h = nnmf.factorize(np.matrix(wordmatrix), nofeatures, max_iter) toppatterns=[[] for i in range(len(paper_titles))] patternnames=[] pc, wc = np.shape(h) for i in range(pc): slist=[] # Create a list of words and their weights for j in range(wc): slist.append((h[i,j],wordvec[j])) # Reverse sort the word list slist.sort() slist.reverse() # Print the first six elements n=[s[1] for s in slist[0:10]] print "----------------------------------------" print "Features/Topic Found" print str(i) + str(n) print "Top 3 Articles Closest Matching Feature/Topic" flist=[] for j in range(len(paper_titles)): # Add the article with its weight flist.append((w[j,i],paper_titles[j])) toppatterns[j].append((w[j,i],i,paper_titles[j])) # Reverse sort the list flist.sort() flist.reverse() # Show the top 3 art for f in flist[0:5]: print f
# NOTE(review): mid-script fragment -- the statements down to the `dates`
# assignment appear to run inside an enclosing per-ticker loop whose
# header, along with the definitions of `rows`, `t`, `tickers`, `prices`,
# `shortest` and `dates`, lies outside this view.  Formatting below is
# reconstructed; confirm against the full file.
print rows
print
# Extract the volume field from every line
# (field 5 of each comma-separated row; rows[0] is presumably a header).
prices[t]=[float(r.split(',')[5]) for r in rows[1:] if r.strip()!='']
# Track the shortest series so all tickers can later be truncated to match.
if len(prices[t])<shortest: shortest=len(prices[t])
# Capture the date column once, from the first ticker processed.
if not dates:
    dates=[r.split(',')[0] for r in rows[1:] if r.strip()!='']
# Build an observations x tickers matrix, truncated to the shortest series.
l1=[[prices[tickers[i]][j] for i in range(len(tickers))] for j in range(shortest)]
# Factor the volume matrix into 5 non-negative components.
w,h=nnmf.factorize(matrix(l1),pc=5)
print "Printing h"
print
print h
print "Printing w"
print w
print "Exiting"
exit()
# Loop over all the features
# NOTE(review): unreachable -- exit() above terminates the script first.
for i in range(shape(h)[0]):
    print "Feature %d" %i
def showarticles(titles, toppatterns, patternnames, out='articles.txt'): outfile = file(out, 'w') # Loop over all the articles for j in range(len(titles)): outfile.write(titles[j].encode('utf8') + '\n') # Get the top features for this article and # reverse sort them toppatterns[j].sort() toppatterns[j].reverse() # Print the top three patterns for i in range(3): outfile.write( str(toppatterns[j][i][0]) + ' ' + str(patternnames[toppatterns[j][i][1]]) + '\n') outfile.write('\n') outfile.close() #allwords,articlewords,articletitles = getarticlewords() #makematrix(allw=allwords,articlew=articletitles) m1 = matrix([[1, 2, 3], [4, 5, 6]]) m2 = matrix([[1, 2], [3, 4], [5, 6]]) print m1 * m2 w, h = nnmf.factorize(m1 * m2, pc=3, iter=100)