Example #1
0
            priors[i] += data[i] * weightForMean
        dataObj.appendRow(data, 1)

    if (idx % 1000000) == 0: logging.debug("Loading Data: %s rows done" % idx)

dataLoadTime = time.time()
logging.debug("loaded %s records into memory" % idx)
logging.debug("time to load memory: %s " % (dataLoadTime - startTime))

for row in dataObj.U:
    if len(row) == 0 and not hasHyperprior:
        # TODO(max): write up a paper describing the hyperprior and link it.
        raise Exception(
            "You can't have any columns with all 0s, unless you provide a hyperprior (-H)"
        )

initPriorWeight = 1
priorSum = sum(priors) + 0.01  # Nudge to prevent zero
for i in range(0, K):
    priors[i] /= priorSum
    priors[i] += 0.01  # Nudge to prevent zero

priors = DME.findDirichletPriors(dataObj, priors, iterations, Beta, W)

print "Final priors: ", priors
logging.debug("Final average loss: %s" %
              DME.getTotalLoss(priors, dataObj, Beta, W))

totalTime = time.time() - dataLoadTime
logging.debug("Time to calculate: %s" % totalTime)
Example #2
0
		
		sumData = sum(data)
		weightForMean = 1.0 / (1.0 + sumData)
		for i in range(0, K): priors[i] += data[i] * weightForMean
		dataObj.appendRow(data, 1)

	if (idx % 1000000) == 0: logging.debug("Loading Data: %s rows done" % idx)

dataLoadTime = time.time()
logging.debug("loaded %s records into memory" % idx)
logging.debug("time to load memory: %s " % (dataLoadTime - startTime))

for row in dataObj.U:
	if len(row) == 0 and not hasHyperprior:
		# TODO(max): write up a paper describing the hyperprior and link it.
		raise Exception("You can't have any columns with all 0s, unless you provide a hyperprior (-H)")

initPriorWeight = 1
priorSum = sum(priors) + 0.01 # Nudge to prevent zero
for i in range(0, K):
  priors[i] /= priorSum
  priors[i] += 0.01 # Nudge to prevent zero

priors = DME.findDirichletPriors(dataObj, priors, iterations, Beta, W)	

print "Final priors: ", priors
logging.debug("Final average loss: %s" % DME.getTotalLoss(priors, dataObj, Beta, W))

totalTime = time.time() - dataLoadTime
logging.debug("Time to calculate: %s" % totalTime)
Example #3
0
def main(K, iterations, H, input_stream, sampleRate, M):
    """Fit Dirichlet priors to tab-separated rows of category counts.

    Args:
        K: Number of categories; every input row must hold exactly K counts.
        iterations: Passed through to ``DME.findDirichletPriors``.
        H: Comma-separated hyperprior string. When it has exactly K + 1
            fields, the first K are parsed as ``Beta`` and the last as ``W``;
            otherwise no hyperprior is used (``Beta`` and ``W`` are ``None``).
        input_stream: Iterable of text lines, one tab-delimited row of
            integer counts per line.
        sampleRate: A row is used when ``random.random() < float(sampleRate)``.
        M: Maximum total count per row. Rows with a larger total have single
            counts removed, at the category chosen by ``Sample.drawCategory``,
            until the total equals M.

    Returns:
        The priors returned by ``DME.findDirichletPriors``.

    Raises:
        Exception: If any entry of ``dataObj.U`` is empty and no hyperprior
            was supplied.
    """
    startTime = time.time()
    logging.debug("K = %s", K)
    logging.debug("iterations = %s", iterations)
    logging.debug("H = %s", H)
    logging.debug("sampleRate = %s", sampleRate)
    logging.debug("M = %s", M)

    # TODO(max): write up a paper describing the hyperprior and link it.
    Hstr = H.split(",")
    hasHyperprior = len(Hstr) == K + 1
    if hasHyperprior:
        Beta = [float(value) for value in Hstr[:K]]
        W = float(Hstr[K])
    else:
        Beta = None
        W = None

    logging.debug("Beta = %s", Beta)
    logging.debug("W = %s", W)

    #####
    # Load Data
    #####
    csv.field_size_limit(1000000000)
    reader = csv.reader(input_stream, delimiter='\t')
    logging.debug("Loading data")
    priors = [0.] * K

    dataObj = DME.CompressedRowData(K)

    # Convert once instead of once per row.
    keepProbability = float(sampleRate)

    idx = 0
    for row in reader:
        idx += 1

        if random.random() < keepProbability:
            # A real list is required: the row is measured and mutated below.
            data = [int(count) for count in row]
            if len(data) != K:
                # Report the offending input row number and skip the row;
                # using it would index past its end or feed malformed counts
                # to the estimator.
                logging.error("There are %s categories, but line has %s counts." % (K, len(data)))
                logging.error("line %s: %s" % (idx, data))
            else:
                # Down-sample rows whose total exceeds M, one count at a
                # time, tracking the total instead of re-summing each pass.
                sumData = sum(data)
                while sumData > M:
                    data[Sample.drawCategory(data)] -= 1
                    sumData -= 1

                weightForMean = 1.0 / (1.0 + sumData)
                for i in range(K):
                    priors[i] += data[i] * weightForMean
                dataObj.appendRow(data, 1)

        if (idx % 1000000) == 0:
            logging.debug("Loading Data: %s rows done" % idx)

    dataLoadTime = time.time()
    logging.debug("loaded %s records into memory" % idx)
    logging.debug("time to load memory: %s " % (dataLoadTime - startTime))

    if not hasHyperprior:
        for row in dataObj.U:
            if len(row) == 0:
                # TODO(max): write up a paper describing the hyperprior and link it.
                raise Exception("You can't have any columns with all 0s, unless you provide a hyperprior (-H)")

    # Normalise the accumulated weights into a starting guess for the priors.
    priorSum = sum(priors) + 0.01  # Nudge to prevent zero
    for i in range(K):
        priors[i] /= priorSum
        priors[i] += 0.01  # Nudge to prevent zero

    priors = DME.findDirichletPriors(dataObj, priors, iterations, Beta, W)

    # print "Final priors: ", priors
    logging.debug("Final average loss: %s" % DME.getTotalLoss(priors, dataObj, Beta, W))

    totalTime = time.time() - dataLoadTime
    logging.debug("Time to calculate: %s" % totalTime)
    return priors
		sumData = sum(data)
		weightForMean = 1.0 / (1.0 + sumData)
		for i in range(0, K): priors[i] += data[i] * weightForMean
		dataObj.appendRow(data, 1)

	if (idx % 1000000) == 0: logging.debug("Loading Data: %s rows done" % idx)

dataLoadTime = time.time()
logging.debug("loaded %s records into memory" % idx)
logging.debug("time to load memory: %s " % (dataLoadTime - startTime))

# TODO(max): Figure out what to do with the all-zero column
#for row in dataObj.U:
#  if len(row) == 0 and not hasHyperprior:
#    # TODO(max): write up a paper describing the hyperprior and link it.
#    raise Exception("You can't have any columns with all 0s, unless you provide a hyperprior (-H)")

initPriorWeight = 1
priorSum = sum(priors) + 0.01 # Nudge to prevent zero
for i in range(0, K):
  priors[i] /= priorSum
  priors[i] += 0.01 # Nudge to prevent zero

priors = DME.findDirichletPriors(dataObj, priors, iterations, hyperprior)	

print "Final priors: ", priors
logging.debug("Final average loss: %s" % DME.getTotalLoss(priors, dataObj, hyperprior))

totalTime = time.time() - dataLoadTime
logging.debug("Time to calculate: %s" % totalTime)
Example #5
0
        for i in range(0, K):
            priors[i] += data[i] * weightForMean
        dataObj.appendRow(data, 1)

    if (idx % 1000000) == 0: logging.debug("Loading Data: %s rows done" % idx)

dataLoadTime = time.time()
logging.debug("loaded %s records into memory" % idx)
logging.debug("time to load memory: %s " % (dataLoadTime - startTime))

# TODO(max): Figure out what to do with the all-zero column
#for row in dataObj.U:
#  if len(row) == 0 and not hasHyperprior:
#    # TODO(max): write up a paper describing the hyperprior and link it.
#    raise Exception("You can't have any columns with all 0s, unless you provide a hyperprior (-H)")

initPriorWeight = 1
priorSum = sum(priors) + 0.01  # Nudge to prevent zero
for i in range(0, K):
    priors[i] /= priorSum
    priors[i] += 0.01  # Nudge to prevent zero

priors = DME.findDirichletPriors(dataObj, priors, iterations, hyperprior)

print "Final priors: ", priors
logging.debug("Final average loss: %s" %
              DME.getTotalLoss(priors, dataObj, hyperprior))

totalTime = time.time() - dataLoadTime
logging.debug("Time to calculate: %s" % totalTime)
Example #6
0
		weightForMean = 1.0 / (1.0 + sumData)
		for i in range(0, K): 
			priors[i] += data[i] * weightForMean
			uVector = uMatrix[i]
			for j in range(0, data[i]):
				if (len(uVector) == j): uVector.append(0)
				uVector[j] += 1
			
		for j in range(0, sumData):
			if (len(vVector) == j): vVector.append(0)
			vVector[j] += 1

	if (i % 1000000) == 0: print "Loading Data", i

dataLoadTime = time.time()
print "all data loaded into memory"
print "time to load memory: ", dataLoadTime - startTime

initPriorWeight = 1
priorSum = sum(priors)
for i in range(0, K): priors[i] /= priorSum

verbose = options.V == "True"
priors = DME.findDirichletPriors(uMatrix, vVector, priors, verbose)	
print "Final priors: ", priors
print "Final average loss:", DME.getTotalLoss(priors, uMatrix, vVector)

totalTime = time.time() - dataLoadTime
print "Time to calculate: " + str(totalTime)