Example #1
	def output(self):
		if not self.localize_output:
			localization.disable()

		terminal.skip_escapes(not sys.stdout.isatty())
		terminal.set_stdout_encoding()
		previous_directory = os.getcwd()

		os.chdir(self.repo)
		absolute_path = basedir.get_basedir_git()
		os.chdir(absolute_path)
		format.output_header()
		outputable.output(changes.ChangesOutput(self.hard))

		if changes.get(self.hard).get_commits():
			outputable.output(blame.BlameOutput(self.hard, self.useweeks))

			if self.timeline:
				outputable.output(timeline.Timeline(changes.get(self.hard), self.useweeks))

			if self.include_metrics:
				outputable.output(metrics.Metrics())

			if self.responsibilities:
				outputable.output(responsibilities.ResponsibilitiesOutput(self.hard, self.useweeks))

			outputable.output(filtering.Filtering())

			if self.list_file_types:
				outputable.output(extensions.Extensions())

		format.output_footer()
		os.chdir(previous_directory)
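
Example #1 delegates repository-root discovery to basedir.get_basedir_git(); Example #2 below inlines the equivalent git rev-parse calls. For reference, a compact, self-contained helper along those lines could look like the following sketch (an illustration of the idea, not the project's actual implementation):

import subprocess
import sys

def get_basedir_git():
    # Sketch of a root-discovery helper: bare repositories resolve to the
    # .git directory, ordinary checkouts to the working-tree top level.
    is_bare = subprocess.check_output(
        ["git", "rev-parse", "--is-bare-repository"]).decode("utf-8", "replace").strip() == "true"
    flag = "--git-dir" if is_bare else "--show-toplevel"
    path = subprocess.check_output(["git", "rev-parse", flag]).decode("utf-8", "replace").strip()
    if not path:
        sys.exit("Unable to determine absolute path of git repository.")
    return path

Example #2 shows the project's own, more explicit variant of the same two rev-parse calls.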
Example #2
    def output(self):
        if not self.localize_output:
            localization.disable()

        terminal.skip_escapes(not sys.stdout.isatty())
        terminal.set_stdout_encoding()
        previous_directory = os.getcwd()

        os.chdir(self.repo)
        isbare = subprocess.Popen("git rev-parse --is-bare-repository",
                                  shell=True,
                                  bufsize=1,
                                  stdout=subprocess.PIPE).stdout
        isbare = isbare.readlines()
        self.isbare = (isbare[0].decode("utf-8", "replace").strip() == "true")
        absolute_path = ""

        if self.isbare:
            absolute_path = subprocess.Popen("git rev-parse --git-dir",
                                             shell=True,
                                             bufsize=1,
                                             stdout=subprocess.PIPE).stdout
        else:
            absolute_path = subprocess.Popen("git rev-parse --show-toplevel",
                                             shell=True,
                                             bufsize=1,
                                             stdout=subprocess.PIPE).stdout

        absolute_path = absolute_path.readlines()
        if len(absolute_path) == 0:
            sys.exit(_("Unable to determine absolute path of git repository."))

        os.chdir(absolute_path[0].decode("utf-8", "replace").strip())
        format.output_header()
        outputable.output(changes.ChangesOutput(self.hard))

        if changes.get(self.hard).get_commits():
            outputable.output(blame.BlameOutput(self.hard))

            if self.timeline:
                outputable.output(
                    timeline.Timeline(changes.get(self.hard), self.useweeks))

            if self.include_metrics:
                outputable.output(metrics.Metrics())

            if self.responsibilities:
                outputable.output(
                    responsibilities.ResponsibilitiesOutput(self.hard))

            outputable.output(filtering.Filtering())

            if self.list_file_types:
                outputable.output(extensions.Extensions())

        format.output_footer()
        os.chdir(previous_directory)
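
Both examples save the working directory up front and only restore it on the last line. If any of the output steps in between raises, the final os.chdir(previous_directory) never runs. A small context manager (a sketch, not part of either example) makes the restore unconditional:

import contextlib
import os

@contextlib.contextmanager
def working_directory(path):
    # Temporarily chdir into `path`; the previous directory is restored
    # even if the wrapped block raises.
    previous = os.getcwd()
    os.chdir(path)
    try:
        yield
    finally:
        os.chdir(previous)

The body of output() could then run inside "with working_directory(self.repo): ..." without tracking previous_directory by hand.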
Example #3
def main():
    """
    Analyzes specified documents.
    """
    options = parse_args()
    print (options)

    # we assume there exist three files: 
    # a vocab file (corpus_vocab.dat)
    # a training file (corpus_train.dat)
    # a validation file (corpus_test.dat)

    corpus = options.corpus

    # vocab file
    with open(corpus + "_vocab.dat", 'r') as vocab_file:
        W = len(vocab_file.readlines())
    #print(open(corpus + "_vocab.dat", 'r').readlines())
    # validation file
    validation_filename = corpus + "_test.dat"

    wikirandom = archived_dataset.Corpus(corpus + "_train.dat") # should be _train.dat
    # else:
    #     import wikirandom

    #load a held-out set
    validation_docs = archived_dataset.loadDocs(validation_filename)
    algorithmname = options.algorithmname

    # the second tells us the batch size
    batchsize = options.batchsize

    # the third tells us a list of number of threads to run. (each will be run sequentially)
    numthreads = options.numthreads

    # number of documents
    trueD = wikirandom._D
    
  
    if algorithmname == "hbb":
        if options.D == -1:
            D = trueD  # hbb needs the total number of documents known in advance
        else:
            D = options.D


    # #prior for topics (ANDRE: this is now a parameter)
    # eta = 1.
    eta = options.eta
    
    # The total number of documents
    #D = 3.3e6 (used to be number in Wikipedia; now an argument)

    # The number of topics
    K = options.K
    alpha = 1./K #* numpy.ones(K)
    batchsize = options.batchsize
    
    if (algorithmname == "hdp_filtering"):
        alg = filtering.HDPFiltering(W, eta, options.max_iters, options.threshold * 1E-6, T=300, K=30)

    if (algorithmname == "ss"):
        if (numthreads == 1):
            alg = filtering.Filtering(W, K, alpha, eta, 1, True, 0.1) # note: last two args shouldn't matter
        else:
            # NOT REALLY SUPPORTED!
            alg = parallelfiltering.ParallelFiltering(W, K, alpha, eta, 1, 0.1, True, options.numthreads)

    if (algorithmname == "filtering"):
        #maxiters = 15
        if (numthreads == 1):
            alg = filtering.Filtering(W, K, alpha, eta, options.max_iters, options.useHBBBound, options.threshold)
        else:
            if getattr(options, "async"):  # "async" is a reserved word since Python 3.7; read the flag via getattr
                alg = asynchronous.ParallelFiltering(W, K, alpha, eta, options.max_iters, options.threshold, options.useHBBBound, options.batchsize, options.numthreads)
                batchsize = batchsize * options.async_batches_per_eval * options.numthreads
            else:
                alg = parallelfiltering.ParallelFiltering(W, K, alpha, eta, options.max_iters, options.threshold, options.useHBBBound, options.numthreads, options.batchsize)
                batchsize = batchsize * options.numthreads

    if (algorithmname == "hbb"):
        #default: tau0 = 1024; kappa = 0.7
        # paper says: kappa = 0.5; tau0 = 64; S (minibatch size) = 4096
        # alg = onlineldavb.OnlineLDA(W, K, D, alpha, 1./K, options.tau0, options.kappa)  # the original code for NIPS submission, eta = 1/K
        alg = onlineldavb.OnlineLDA(W, K, D, alpha, eta, options.tau0, options.kappa)

    # EP for LDA
    if (algorithmname == "filtering_ep"):
        if (numthreads == 1):
            alg = filtering.FilteringEP(W, K, alpha, eta, options.max_iters, options.threshold, options.useNewton)
        else:
            if getattr(options, "async"):
                alg = asynchronous.ParallelFilteringEP(W, K, alpha, eta, options.max_iters, options.threshold, options.useNewton, options.batchsize, options.numthreads)
                batchsize = batchsize * options.async_batches_per_eval * options.numthreads
            else:
                alg = parallelfiltering.ParallelFilteringEP(W, K, alpha, eta, options.max_iters, options.threshold, options.useNewton, options.numthreads, options.batchsize)
                batchsize = batchsize * options.numthreads

    # Fake EP for LDA (?) -- to be removed eventually since it's worse than true EP
    if (algorithmname == "filtering_ep2"):
        if (numthreads == 1):
            alg = filtering.FilteringEP2(W, K, alpha, eta, options.max_iters, options.threshold, options.useNewton)
        else:
            if getattr(options, "async"):
                alg = asynchronous.ParallelFilteringEP2(W, K, alpha, eta, options.max_iters, options.threshold, options.useNewton, options.batchsize, options.numthreads)
                batchsize = batchsize * options.async_batches_per_eval * options.numthreads
            else:
                alg = parallelfiltering.ParallelFilteringEP2(W, K, alpha, eta, options.max_iters, options.threshold, options.useNewton, options.numthreads, options.batchsize)
                batchsize = batchsize * options.numthreads
    

    # Specify the minimum number of points to be processed before we run the evaluation code, since evaluation is expensive
    minNumPtsPerEval = options.minNumPtsPerEval
    expGrowthEval = options.expGrowthEval
    if (minNumPtsPerEval <= 0):
        if (corpus == "nature"):  # 351K docs
            minNumPtsPerEval = 512 #1e3
        elif (corpus == "wiki"):  # 3.6M docs
            #minNumPtsPerEval = 512 #1e3 #2e4
            minNumPtsPerEval = 2  # for toy wiki dataset
        else:
            minNumPtsPerEval = int(trueD / 1000)

    print ("Using algorithm: " + str(alg))
    recordedData = []
    totalTime = 0.0
    totalDownloadingTime = 0.0
    iters = int(trueD / batchsize) + 1
    #print(iters, batchsize, trueD)
    numPtsProc = 0  # number of points processed since last evaluation
    for iteration in range(iters):
        # Get some articles
        start = time.time()
        docset = wikirandom.get_random_docs(batchsize)
        totalDownloadingTime += time.time() - start
        start = time.time()
        (alg_alpha, alg_lam) = alg.update_lambda(docset)
        iter_time = time.time() - start
        totalTime += iter_time
        numPtsProc += batchsize  # we have processed this many more points
        if (numPtsProc >= minNumPtsPerEval or iteration == iters-1):  # evaluate if we have processed enough points, or this is the last iteration
            numPtsProc = 0  # reset the counter
            # The following is just the usual evaluation code from before
            start = time.time()
            (perplex, split) = evaluation.evaluate(validation_docs, alg_alpha, alg_lam, options.usePtEst)
            testTime = time.time() - start
            print (str(iteration+1) + "/" + str(iters) + " " + str(alg) + " (%g, %g): held-out perplexity estimate = %f, %f" % (iter_time, testTime, perplex, split))
            recordedData += [((iteration+1)*batchsize, totalTime, totalDownloadingTime, perplex, split)]  # also save perplexity now!
            if (algorithmname in ["hbb", "filtering", "filtering_ep", "filtering_ep2"]):
                outfile = corpus + "_" + str(alg) + "_" + str(batchsize) + "_eta" + str(eta)  # need to distinguish eta now
            else:
                outfile = corpus + "_" + algorithmname + "_" + str(options.batchsize) + "_" + str(options.numthreads) + "_eta" + str(eta)
            numpy.save(outfile, recordedData)

            if (expGrowthEval):
                # double the number of points to process before the next evaluation
                minNumPtsPerEval = minNumPtsPerEval * 2
        else:
            print (str(iteration+1) + "/" + str(iters) + " " + str(alg) + " (%g)" % (iter_time))

        if (iteration == iters-1):
            # save final lambda matrix
            if (algorithmname in ["hbb", "filtering", "filtering_ep", "filtering_ep2"]):
                topics_outfile = "topics_" + corpus + "_" + str(alg) + "_" + str(batchsize) + "_eta" + str(eta)  # need to distinguish eta now
            else:
                topics_outfile = "topics_" + corpus + "_" + algorithmname + "_" + str(options.batchsize) + "_" + str(options.numthreads)
            numpy.save(topics_outfile, alg_lam)

    # asynchronous filtering needs to terminate its workers
    if (algorithmname == "filtering"):
        if (numthreads > 1):
            if getattr(options, "async"):
                alg.shutdown()

    print ("DONE!")