def train_extractor(extractor, filenames, window_size):
    """Train ``extractor`` on windowed input, finalise it and save it.

    Args:
        extractor: Object exposing ``train``, ``finalise`` and ``save``.
        filenames: Passed through as the first argument of ``windowed``.
        window_size: Passed through as the second argument of ``windowed``.

    Returns:
        Whatever ``extractor.save()`` returns.
    """
    windows = windowed(filenames, window_size)
    extractor.train(windows)
    extractor.finalise()
    saved = extractor.save()
    return saved
import matplotlib.pyplot as plt
from collections import defaultdict


def plot_hist(bin_size, bin_list, upper=None):
    """Draw one bar chart per histogram in ``bin_list`` and display them.

    Args:
        bin_size: Not used by this function; kept so existing callers
            keep working.
        bin_list: Iterable of histograms. Each histogram is indexed with
            the integers ``0 .. upper bound`` (presumably a ``defaultdict``
            of counts keyed by bin index -- confirm against callers).
        upper: Highest bin index to plot. When ``None``, ``max(bins)`` is
            used for each histogram.
    """
    for bins in bin_list:
        fig = plt.figure()
        ax = fig.add_subplot(1, 1, 1)
        # Test for None explicitly so an explicit upper=0 is honoured
        # instead of silently falling back to max(bins).
        up_bound = max(bins) if upper is None else upper
        x = list(range(up_bound + 1))
        y = [bins[i] for i in x]
        ax.bar(x, y, width=1)
    # NOTE(review): figures are shown once, after all of them are built;
    # move this call into the loop if one-window-at-a-time display is wanted.
    plt.show()


# One space-joined document per window. argv[1] is parsed as an int and
# argv[2:] is handed to ``windowed`` (presumably window size and input
# filenames -- confirm against ``windowed``).
docs = [' '.join(w[2]) for w, _ in windowed(sys.argv[2:], int(sys.argv[1]))]
tokenised_docs = [filter_tokenise(i) for i in docs]
num_topics = 3
# NOTE(review): this rebinds the name ``lda`` (presumably the imported
# module) to the sampler instance; the name is kept because code further
# down the script may rely on it.
lda = lda.LDASampler(
    docs=tokenised_docs,
    num_topics=num_topics,
    alpha=0.25,
    beta=0.25)

# Parenthesised single-argument prints produce the same output under the
# Python 2 print statement and also parse under Python 3.
print('Sampling...')
for _ in range(100):
    # Current assignments; ``zs`` stays bound at module level after the loop.
    zs = lda.assignments
    lda.next()
print('')