def output(self):
    if not self.localize_output:
        localization.disable()

    terminal.skip_escapes(not sys.stdout.isatty())
    terminal.set_stdout_encoding()
    previous_directory = os.getcwd()

    os.chdir(self.repo)
    absolute_path = basedir.get_basedir_git()
    os.chdir(absolute_path)
    format.output_header()
    outputable.output(changes.ChangesOutput(self.hard))

    if changes.get(self.hard).get_commits():
        outputable.output(blame.BlameOutput(self.hard, self.useweeks))

        if self.timeline:
            outputable.output(timeline.Timeline(changes.get(self.hard), self.useweeks))

        if self.include_metrics:
            outputable.output(metrics.Metrics())

        if self.responsibilities:
            outputable.output(responsibilities.ResponsibilitiesOutput(self.hard, self.useweeks))

        outputable.output(filtering.Filtering())

        if self.list_file_types:
            outputable.output(extensions.Extensions())

    format.output_footer()
    os.chdir(previous_directory)
def output(self):
    if not self.localize_output:
        localization.disable()

    terminal.skip_escapes(not sys.stdout.isatty())
    terminal.set_stdout_encoding()
    previous_directory = os.getcwd()

    os.chdir(self.repo)
    isbare = subprocess.Popen("git rev-parse --is-bare-repository", shell=True, bufsize=1,
                              stdout=subprocess.PIPE).stdout
    isbare = isbare.readlines()
    self.isbare = (isbare[0].decode("utf-8", "replace").strip() == "true")
    absolute_path = ""

    if self.isbare:
        absolute_path = subprocess.Popen("git rev-parse --git-dir", shell=True, bufsize=1,
                                         stdout=subprocess.PIPE).stdout
    else:
        absolute_path = subprocess.Popen("git rev-parse --show-toplevel", shell=True, bufsize=1,
                                         stdout=subprocess.PIPE).stdout

    absolute_path = absolute_path.readlines()
    if len(absolute_path) == 0:
        sys.exit(_("Unable to determine absolute path of git repository."))

    os.chdir(absolute_path[0].decode("utf-8", "replace").strip())
    format.output_header()
    outputable.output(changes.ChangesOutput(self.hard))

    if changes.get(self.hard).get_commits():
        outputable.output(blame.BlameOutput(self.hard))

        if self.timeline:
            outputable.output(timeline.Timeline(changes.get(self.hard), self.useweeks))

        if self.include_metrics:
            outputable.output(metrics.Metrics())

        if self.responsibilities:
            outputable.output(responsibilities.ResponsibilitiesOutput(self.hard))

        outputable.output(filtering.Filtering())

        if self.list_file_types:
            outputable.output(extensions.Extensions())

    format.output_footer()
    os.chdir(previous_directory)
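# The first version of output() above delegates repository-root detection to
# basedir.get_basedir_git(), while the second inlines the "git rev-parse" calls.
# Below is a minimal sketch of how such a helper could be factored out of the
# inline logic; the standalone module layout and plain-string error message are
# assumptions, not the project's confirmed basedir API.
import subprocess
import sys


def get_basedir_git():
    """Return the absolute path of the enclosing git repository.

    For a bare repository this is the directory reported by --git-dir;
    otherwise it is the working-tree top level from --show-toplevel.
    """
    # Ask git whether the current directory belongs to a bare repository.
    bare = subprocess.Popen("git rev-parse --is-bare-repository", shell=True,
                            bufsize=1, stdout=subprocess.PIPE).stdout.readlines()
    is_bare = bare[0].decode("utf-8", "replace").strip() == "true"

    command = "git rev-parse --git-dir" if is_bare else "git rev-parse --show-toplevel"
    path = subprocess.Popen(command, shell=True, bufsize=1,
                            stdout=subprocess.PIPE).stdout.readlines()

    if len(path) == 0:
        # The inline version wraps this message in the gettext _() helper.
        sys.exit("Unable to determine absolute path of git repository.")

    return path[0].decode("utf-8", "replace").strip()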
def main():
    """
    Analyzes specified documents.
    """
    options = parse_args()
    print(options)

    # we assume there exist three files:
    # a vocab file (corpus_vocab.dat)
    # a training file (corpus_train.dat)
    # a validation file (corpus_test.dat)
    corpus = options.corpus

    # vocab file
    W = len(open(corpus + "_vocab.dat", 'r').readlines())
    #print(open(corpus + "_vocab.dat", 'r').readlines())

    # validation file
    validation_filename = corpus + "_test.dat"

    wikirandom = archived_dataset.Corpus(corpus + "_train.dat")  # should be _train.dat
    # else:
    #     import wikirandom

    # load a held-out set
    validation_docs = archived_dataset.loadDocs(validation_filename)

    algorithmname = options.algorithmname

    # the second tells us the batch size
    batchsize = options.batchsize

    # the third tells us a list of number of threads to run. (each will be run sequentially)
    numthreads = options.numthreads

    # number of documents
    trueD = wikirandom._D
    if (algorithmname == "hbb"):
        if options.D == -1:
            D = trueD  # number of documents to know in advance
        else:
            D = options.D

    # #prior for topics (ANDRE: this is now a parameter)
    # eta = 1.
    eta = options.eta

    # The total number of documents
    #D = 3.3e6 (used to be number in Wikipedia; now an argument)

    # The number of topics
    K = options.K
    alpha = 1. / K  #* numpy.ones(K)
    batchsize = options.batchsize

    if (algorithmname == "hdp_filtering"):
        alg = filtering.HDPFiltering(W, eta, options.max_iters, options.threshold * 1E-6,
                                     T=300, K=30)

    if (algorithmname == "ss"):
        if (numthreads == 1):
            alg = filtering.Filtering(W, K, alpha, eta, 1, True, 0.1)  # note: last two args shouldn't matter
        else:
            # NOT REALLY SUPPORTED!
            alg = parallelfiltering.ParallelFiltering(W, K, alpha, eta, 1, 0.1, True,
                                                      options.numthreads)

    if (algorithmname == "filtering"):
        #maxiters = 15
        if (numthreads == 1):
            alg = filtering.Filtering(W, K, alpha, eta, options.max_iters,
                                      options.useHBBBound, options.threshold)
        else:
            if (options.async):
                alg = asynchronous.ParallelFiltering(W, K, alpha, eta, options.max_iters,
                                                     options.threshold, options.useHBBBound,
                                                     options.batchsize, options.numthreads)
                batchsize = batchsize * options.async_batches_per_eval * options.numthreads
            else:
                alg = parallelfiltering.ParallelFiltering(W, K, alpha, eta, options.max_iters,
                                                          options.threshold, options.useHBBBound,
                                                          options.numthreads, options.batchsize)
                batchsize = batchsize * options.numthreads

    if (algorithmname == "hbb"):
        #default: tau0 = 1024; kappa = 0.7
        # paper says: kappa = 0.5; tau0 = 64; S (minibatch size) = 4096
        # alg = onlineldavb.OnlineLDA(W, K, D, alpha, 1./K, options.tau0, options.kappa)  # the original code for NIPS submission, eta = 1/K
        alg = onlineldavb.OnlineLDA(W, K, D, alpha, eta, options.tau0, options.kappa)

    # EP for LDA
    if (algorithmname == "filtering_ep"):
        if (numthreads == 1):
            alg = filtering.FilteringEP(W, K, alpha, eta, options.max_iters,
                                        options.threshold, options.useNewton)
        else:
            if (options.async):
                alg = asynchronous.ParallelFilteringEP(W, K, alpha, eta, options.max_iters,
                                                       options.threshold, options.useNewton,
                                                       options.batchsize, options.numthreads)
                batchsize = batchsize * options.async_batches_per_eval * options.numthreads
            else:
                alg = parallelfiltering.ParallelFilteringEP(W, K, alpha, eta, options.max_iters,
                                                            options.threshold, options.useNewton,
                                                            options.numthreads, options.batchsize)
                batchsize = batchsize * options.numthreads

    # Fake EP for LDA (?) -- to be removed eventually since it's worse than true EP
    if (algorithmname == "filtering_ep2"):
        if (numthreads == 1):
            alg = filtering.FilteringEP2(W, K, alpha, eta, options.max_iters,
                                         options.threshold, options.useNewton)
        else:
            if (options.async):
                alg = asynchronous.ParallelFilteringEP2(W, K, alpha, eta, options.max_iters,
                                                        options.threshold, options.useNewton,
                                                        options.batchsize, options.numthreads)
                batchsize = batchsize * options.async_batches_per_eval * options.numthreads
            else:
                alg = parallelfiltering.ParallelFilteringEP2(W, K, alpha, eta, options.max_iters,
                                                             options.threshold, options.useNewton,
                                                             options.numthreads, options.batchsize)
                batchsize = batchsize * options.numthreads

    # Specify the minimum number of points to be processed before we run the
    # evaluation code, since evaluation is expensive
    minNumPtsPerEval = options.minNumPtsPerEval
    expGrowthEval = options.expGrowthEval
    if (minNumPtsPerEval <= 0):
        if (corpus == "nature"):  # 351K docs
            minNumPtsPerEval = 512  #1e3
        elif (corpus == "wiki"):  # 3.6M docs
            #minNumPtsPerEval = 512 #1e3 #2e4
            minNumPtsPerEval = 2  # for toy wiki dataset
        else:
            minNumPtsPerEval = int(trueD / 1000)

    print("Using algorithm: " + str(alg))

    recordedData = []
    totalTime = 0.0
    totalDownloadingTime = 0.0
    iters = int(trueD / batchsize) + 1
    #print(iters, batchsize, trueD)

    numPtsProc = 0  # number of points processed since last evaluation
    for iteration in range(iters):
        # Get some articles
        start = time.time()
        docset = wikirandom.get_random_docs(batchsize)
        totalDownloadingTime += time.time() - start

        start = time.time()
        (alg_alpha, alg_lam) = alg.update_lambda(docset)
        iter_time = time.time() - start
        totalTime += iter_time

        numPtsProc += batchsize  # we have processed this many more points
        if (numPtsProc >= minNumPtsPerEval or iteration == iters - 1):
            # evaluate if we have processed enough points, or this is the last iteration
            numPtsProc = 0  # reset the counter
            # The following is just the usual evaluation code from before
            start = time.time()
            (perplex, split) = evaluation.evaluate(validation_docs, alg_alpha, alg_lam,
                                                   options.usePtEst)
            testTime = time.time() - start
            print(str(iteration + 1) + "/" + str(iters) + " " + str(alg) +
                  " (%g, %g): held-out perplexity estimate = %f, %f"
                  % (iter_time, testTime, perplex, split))
            recordedData += [((iteration + 1) * batchsize, totalTime, totalDownloadingTime,
                              perplex, split)]

            # also save perplexity now!
            if (algorithmname in ["hbb", "filtering", "filtering_ep", "filtering_ep2"]):
                outfile = corpus + "_" + str(alg) + "_" + str(batchsize) + "_eta" + str(eta)  # need to distinguish eta now
            else:
                outfile = corpus + "_" + algorithmname + "_" + str(options.batchsize) + "_" + \
                          str(options.numthreads) + "_eta" + str(eta)
            numpy.save(outfile, recordedData)

            if (expGrowthEval):
                # double the number of points to the next evaluation
                minNumPtsPerEval = minNumPtsPerEval * 2
        else:
            print(str(iteration + 1) + "/" + str(iters) + " " + str(alg) + " (%g)" % (iter_time))

        if (iteration == iters - 1):
            # save final lambda matrix
            if (algorithmname in ["hbb", "filtering", "filtering_ep", "filtering_ep2"]):
                topics_outfile = "topics_" + corpus + "_" + str(alg) + "_" + str(batchsize) + \
                                 "_eta" + str(eta)  # need to distinguish eta now
            else:
                topics_outfile = "topics_" + corpus + "_" + algorithmname + "_" + \
                                 str(options.batchsize) + "_" + str(options.numthreads)
            numpy.save(topics_outfile, alg_lam)

    # asynchronous filtering needs to terminate its workers
    if (algorithmname == "filtering"):
        if (numthreads > 1):
            if (options.async):
                alg.shutdown()

    print("DONE!")
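# main() snapshots recordedData to disk with numpy.save after every evaluation,
# so partial results survive an interrupted run. Below is a minimal sketch of
# reading such a snapshot back for inspection or plotting; the filename is
# illustrative only and simply mimics the outfile pattern built in main().
import numpy

# numpy.save appends ".npy"; each row is one evaluation point:
# (documents seen, cumulative training time, cumulative document-loading time,
#  held-out perplexity, split estimate)
records = numpy.load("wiki_filtering_256_eta1.0.npy")

for docs_seen, train_time, load_time, perplexity, split in records:
    print("%d docs: perplexity %.2f (train %.1fs, load %.1fs)"
          % (int(docs_seen), perplexity, train_time, load_time))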