def mkrundir(self, rundir):
    # sort out the rundir
    if not rundir:
        # ok so we will generate one under the default rundirs root
        rundir_root = RUNDIRS_ROOT
        rundir = rungen.mk_next_rundir(rundir_root)
    # ok so we have a rundir now
    assert os.path.exists(rundir), "rundir doesn't exist!"
    self.rundir = rundir    # tag it onto the LdaModel while we are at it
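# For reference: rungen.mk_next_rundir(root) is assumed to create and return the
# next sequentially numbered run directory under `root` (the assert above relies
# on the returned path existing on disk).  The helper below is only a hypothetical
# illustration of that behaviour -- the name, numbering scheme and "run%04d"
# pattern are guesses, not the actual liblda.util.rungen implementation.
def example_mk_next_rundir(root):
    import os
    existing = [d for d in os.listdir(root) if d.startswith("run") and d[3:].isdigit()]
    next_num = max([int(d[3:]) for d in existing]) + 1 if existing else 0
    new_dir = os.path.join(root, "run%04d" % next_num)
    os.mkdir(new_dir)
    return new_dir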
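# The sparseness reporting in run() below assumes that get_sparse_stats(mat)
# returns a 1-D normalized histogram `sp` in which sp[i] is the fraction of rows
# of `mat` that have i "non-negligible" entries; the weighted sums over the
# nonzero bins then give the center and spread of that histogram.  The helper
# below is only a hypothetical illustration of an array with that shape -- the
# real get_sparse_stats lives elsewhere in liblda and may differ in details
# (in particular the threshold used here is an assumption).
def example_sparse_stats(mat, threshold=1e-4):
    import numpy as np
    counts = (mat > threshold).sum(axis=1)      # non-negligible entries per row
    hist = np.bincount(counts, minlength=mat.shape[1] + 1).astype(float)
    return hist / hist.sum()                    # normalized histogram over i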
def run(args):
    """
    The command-line run script for LDA experiments.
    """
    # scientific
    import numpy as np
    import scipy as sp

    # display what run got in args
    for tup in args.__dict__.iteritems():
        print tup

    # LOAD VOCAB
    wlist = smart_list_reader(args.vocab_file)
    if not wlist:
        print "Vocab format not recognized"
        sys.exit(-1)
    # convert from list [term1, term2, ...] to dicts
    # {term1:0, term2:1, ...} and the inverse mapping
    id2word = dict(enumerate(wlist))
    word2id = dict([(word, id) for id, word in id2word.items()])
    vocab = word2id

    # SETUP CORPUS (LAZY)
    # doCounts -- not so lazy...
    if args.docs_file[-3:] == ".mm":
        from liblda.newmmcorpus import NewMmCorpus
        corpus = NewMmCorpus(args.docs_file)
        corpus.setVocabFromDict(vocab)
        corpus.doCounts()
    elif args.docs_file[-4:] == ".txt":
        from liblda.low2corpus import Low2Corpus
        corpus = Low2Corpus(args.docs_file)
        corpus.setVocabFromDict(vocab)
        corpus.doCounts()
    else:
        print "Corpus format not recognized"
        sys.exit(-1)

    # Create rundir
    from socket import gethostname
    from liblda.util import rungen
    full_hostname = gethostname()
    # strip the domain suffix to get a short host id
    domain_suffix = ".cs.mcgill.ca"
    if full_hostname.endswith(domain_suffix):
        host_id = full_hostname[:-len(domain_suffix)]
    else:
        host_id = full_hostname
    if not args.rundirs_root:
        rundirs_root = RUNDIRS_ROOT
    else:
        rundirs_root = args.rundirs_root
    if not os.path.exists(rundirs_root):
        print "Error, rundirs_root %s doesn't exist" % rundirs_root
        sys.exit(-1)
    # create the host-specific rundir if necessary
    host_rundirs_root = os.path.join(rundirs_root, host_id)
    if not os.path.exists(host_rundirs_root):
        os.mkdir(host_rundirs_root)
    # create a new (sequential) rundir for this host
    rundir = rungen.mk_next_rundir(host_rundirs_root)
    logger.info("rundir: " + rundir)

    # prepare a dict which will become input.json
    input = {}
    input["rundir"] = rundir
    input["numT"] = args.numT
    input["iter"] = args.iter
    input["corpus"] = args.docs_file
    input["vocab"] = args.vocab_file
    input["alpha"] = args.alpha
    input["beta"] = args.beta
    input["seed"] = args.seed
    input["host_id"] = host_id
    # and write it to disk
    f = open(os.path.join(rundir, "input.json"), "w")
    simplejson.dump(input, f, indent=0)
    f.close()

    start_time = datetime.datetime.now()

    # set up the lda model
    lda = LdaModel(numT=args.numT, alpha=args.alpha, beta=args.beta,
                   corpus=corpus, vocab=vocab)

    # if not in seeded mode, run as usual
    if not args.seed_z_from:
        if not args.save_perplexity_every:
            lda.train(iter=args.iter, seed=args.seed)
        else:
            # sample in chunks so we can record the perplexity along the way
            lda.allocate_arrays()
            lda.read_dw_alphabetical()
            lda.random_initialize()
            cum = 0
            perp_hist = []
            while cum < args.iter:
                lda.gibbs_sample(iter=args.save_perplexity_every, seed=args.seed + cum)
                lda.wpdt_to_probs()
                perp_hist.append(lda.perplexity())  # = np.exp( -1 * loglike() / totalNwords )
                cum += args.save_perplexity_every
    # NEW: seeded z training
    else:
        logger.info("Using seeded z training ... ")
        # training params
        if not args.iter:
            lda.iter = 50
        else:
            lda.iter = args.iter
        if not args.seed:
            seed = 777
            lda.seed = 2 * seed + 1
        else:
            lda.seed = 2 * args.seed + 1
        # load up the seed_z_from file into a seed_z np array
        seed_z = np.load(args.seed_z_from)
        if args.expand_factors:
            expand_factors_str = smart_list_reader(args.expand_factors)
            expand_factors = np.array([int(i) for i in expand_factors_str])
        else:
            expand_factors = None   # let lda.seeded_initialize() handle it
        # custom train sequence
        lda.allocate_arrays()
        lda.read_dw_alphabetical()
        # self.random_initialize()  # NO -- we want a seeded initialization!
        lda.seeded_initialize(seed_z, expand_factors)
        lda.gibbs_sample(iter=lda.iter, seed=lda.seed)
        lda.wpdt_to_probs()
        # self.deallocate_arrays()

    # record how long it took
    end_time = datetime.datetime.now()
    duration = (end_time - start_time).seconds

    # save word counts and topic assignment counts (these are sparse)
    if args.save_counts:    # True by default
        state = ["dp", "wp", "alpha", "beta"]
        for var_name in state:
            f_name = os.path.join(rundir, RUN_FILENAMESS[var_name])
            np.save(f_name, lda.__getattribute__(var_name))
        logger.info("Done writing out Nwt+beta, Ndt+alpha")

    # Gibbs sampler state, which consists of
    # the full topic assignments "z.npy"
    if args.save_z:
        var_name = "z"
        f_name = os.path.join(rundir, RUN_FILENAMESS[var_name])
        np.save(f_name, lda.__getattribute__(var_name))
        logger.info("Done writing out z.npy")

    # save probs
    if args.save_probs:
        probs = ["phi", "theta"]
        for var_name in probs:
            f_name = os.path.join(rundir, RUN_FILENAMESS[var_name])
            np.save(f_name, lda.__getattribute__(var_name))
        logger.info("Done writing out probabilities phi.npy and theta.npy")

    # prepare a dict which will become output.json
    output = {}
    # run details
    output["rundir"] = rundir
    output["host_id"] = host_id
    output["iter"] = args.iter
    output["seed"] = args.seed
    output["start_time"] = start_time.isoformat()   # ISO format string
    # to read ISO time stamps use dateutil
    # >>> from dateutil import parser
    # >>> parser.parse("2011-01-25T23:36:43.373248")
    # datetime.datetime(2011, 1, 25, 23, 36, 43, 373247)
    output["duration"] = int(duration)
    # corpus info
    output["corpus"] = args.docs_file
    output["vocab"] = args.vocab_file
    output["numDocs"] = lda.numDocs
    output["numTerms"] = lda.numTerms
    output["totalNterms"] = lda.corpus.totalNwords
    # model parameters
    output["numT"] = lda.numT
    # the hyperparameters are too long to store in full here,
    # use separate .npy files if alpha/beta are non-uniform
    output["alpha"] = lda.alpha[0]  # [np.average(lda.alpha), float(np.cov(lda.alpha))]  # [avg, var]
    output["beta"] = lda.beta[0]    # [np.average(lda.beta),  float(np.cov(lda.beta))]   # [avg, var]

    # calculate likelihood
    output["loglike"] = lda.loglike()
    output["perplexity"] = lda.perplexity()     # = np.exp( -1 * loglike() / totalNwords )
    if not args.seed_z_from and args.save_perplexity_every:
        # perplexity history is only collected in the non-seeded, chunked run
        output["perplexity_history"] = perp_hist
    logger.info("Log likelihood: %f" % output["loglike"])
    logger.info("Perplexity: %f" % output["perplexity"])

    # special seeding info
    if args.seed_z_from:
        output["seed_z_from"] = args.seed_z_from
    if args.expand_factors:
        output["expand_factors"] = args.expand_factors

    # compute sparseness and write it out
    sp = get_sparse_stats(lda.phi)
    np.save(os.path.join(rundir, "phi_sparseness.npy"), sp)
    # report on sparseness statistics (assume a single mode)
    nz = sp.nonzero()[0]    # indices of the nonzero entries
    sp_avg = sum([sp[i] * i for i in nz])   # where is the distribution concentrated?
    sp_var = sum([sp[i] * np.abs(i - sp_avg) ** 2 for i in nz])
    sp_stdev = np.sqrt(sp_var)  # how concentrated it is around sp_avg
    output["phi_sparseness_avg"] = sp_avg
    output["phi_sparseness_stdev"] = sp_stdev
    logger.info("Phi sparseness. center=%d, width=%d" % (int(sp_avg), int(sp_stdev)))

    # same for theta
    sp = get_sparse_stats(lda.theta)
    np.save(os.path.join(rundir, "theta_sparseness.npy"), sp)
    # report on sparseness statistics (assume a single mode)
    nz = sp.nonzero()[0]    # indices of the nonzero entries
    sp_avg = sum([sp[i] * i for i in nz])   # where is the distribution concentrated?
    sp_var = sum([sp[i] * np.abs(i - sp_avg) ** 2 for i in nz])
    sp_stdev = np.sqrt(sp_var)  # how concentrated it is around sp_avg
    output["theta_sparseness_avg"] = sp_avg
    output["theta_sparseness_stdev"] = sp_stdev
    logger.info("Theta sparseness. center=%d, width=%d" % (int(sp_avg), int(sp_stdev)))

    # write all output data to disk
    f = open(os.path.join(rundir, "output.json"), "w")
    simplejson.dump(output, f, indent=0)
    f.close()
    logger.info("Done saving output.json")

    if args.print_topics:
        from liblda.topicviz.show_top import show_top
        top_words_in_topics = show_top(lda.phi, num=args.print_topics,
                                       id2word=lda.corpus.id2word)
        for topic in top_words_in_topics:
            words = ", ".join(topic)
            print words

    logger.info("Done! --> thank you come again")
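# run() expects an argparse-style namespace.  The real CLI setup is not part of
# this excerpt; the sketch below only illustrates one way the expected attributes
# could be wired up -- every flag name and default here is an assumption, only the
# attribute names (numT, iter, docs_file, ...) are taken from run() above.
if __name__ == "__main__":
    import argparse
    parser = argparse.ArgumentParser(description="Run an LDA experiment")
    parser.add_argument("--docs_file", required=True, help="corpus file (.mm or .txt)")
    parser.add_argument("--vocab_file", required=True, help="vocabulary file")
    parser.add_argument("--numT", type=int, required=True, help="number of topics")
    parser.add_argument("--iter", type=int, default=None, help="number of Gibbs iterations")
    parser.add_argument("--alpha", type=float, default=None)
    parser.add_argument("--beta", type=float, default=None)
    parser.add_argument("--seed", type=int, default=None)
    parser.add_argument("--rundirs_root", default=None)
    parser.add_argument("--seed_z_from", default=None, help="npy file with a seed z assignment")
    parser.add_argument("--expand_factors", default=None)
    parser.add_argument("--save_counts", action="store_true", default=True)
    parser.add_argument("--save_z", action="store_true", default=False)
    parser.add_argument("--save_probs", action="store_true", default=False)
    parser.add_argument("--save_perplexity_every", type=int, default=None)
    parser.add_argument("--print_topics", type=int, default=None,
                        help="print the top N words of each topic")
    run(parser.parse_args())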