def sampleRow(self, amount):
   category = ST.drawCategory(self.mixture)
   multinomial = self.multinomials[category]
   retVal = [0]*self.K
   for i in range(0, amount):
     k = ST.drawCategory(multinomial)
     retVal[k] += 1
   return retVal
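The helper ST.drawCategory is used throughout these examples but its implementation is not shown. A minimal sketch of what it presumably does, drawing an index in proportion to a (possibly unnormalized) weight vector, is below; the standalone name drawCategory and its exact behavior are assumptions, not the project's actual code.

import random

def drawCategory(weights):
    # Sketch only: return index k with probability weights[k] / sum(weights).
    total = sum(weights)
    r = random.random() * total
    cumulative = 0.0
    for k, w in enumerate(weights):
        cumulative += w
        if r < cumulative:
            return k
    return len(weights) - 1  # guard against floating-point round-off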
 def sampleRow(self, amount):
   category = ST.drawCategory(self.mixture)
   dirichlet = self.dirichlets[category]
   multinomial = ST.drawFromDirichlet(dirichlet)
   retVal = [0]*self.K
   for i in range(0, amount):
     k = ST.drawCategory(multinomial)
     retVal[k] += 1
   return retVal
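ST.drawFromDirichlet is likewise not shown. Assuming it maps a vector of Dirichlet parameters to a sampled multinomial probability vector, a sketch using the standard gamma-normalization construction could look like this (again an assumption, not the library's actual implementation):

import random

def drawFromDirichlet(alphas):
    # Sketch only: sample p ~ Dirichlet(alphas) by normalizing independent
    # Gamma(alpha_k, 1) draws.
    gammas = [random.gammavariate(a, 1.0) for a in alphas]
    total = sum(gammas)
    return [g / total for g in gammas]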
Example #5
  def sampleRow(self, amount):
    c = ST.drawCategory(self.multinomialMixture.mixture)
    # Recurse into the child node for this mixture component, if one exists.
    if (self.mixtureNodes[c]): return self.mixtureNodes[c].sampleRow(amount)

    # Otherwise sample from this node's own multinomial.
    multinomial = self.multinomialMixture.multinomials[c]
    retVal = [0]*self.K
    for i in range(0, amount):
      k = ST.drawCategory(multinomial)
      retVal[k] += 1
    return retVal
Example #6
dataObj = DME.CompressedRowData(K)

idx = 0
for row in reader:
    idx += 1

    if (random.random() < float(options.sampleRate)):
        data = list(map(int, row))
        if (len(data) != K):
            logging.error("There are %s categories, but line has %s counts." %
                          (K, len(data)))
            logging.error("line %s: %s" % (idx, data))

        while sum(data) > options.M:
            data[Sample.drawCategory(data)] -= 1

        sumData = sum(data)
        weightForMean = 1.0 / (1.0 + sumData)
        for i in range(0, K):
            priors[i] += data[i] * weightForMean
        dataObj.appendRow(data, 1)

    if (idx % 1000000) == 0: logging.debug("Loading Data: %s rows done" % idx)

dataLoadTime = time.time()
logging.debug("loaded %s records into memory" % idx)
logging.debug("time to load memory: %s " % (dataLoadTime - startTime))

for row in dataObj.U:
    if len(row) == 0 and not hasHyperprior:
        raise Exception("You can't have any columns with all 0s, unless you provide a hyperprior (-H)")
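The while sum(data) > options.M loop above caps each sampled row at M total counts by repeatedly removing one count from a category drawn in proportion to the current counts, which roughly preserves the row's proportions. As a standalone sketch (using the hypothetical drawCategory from earlier, not the project's Sample module):

def capRowCounts(data, M, drawCategory):
    # Sketch only: remove one count at a time from a category chosen in
    # proportion to its current count, until the row sums to at most M.
    while sum(data) > M:
        data[drawCategory(data)] -= 1
    return data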
Example #7
priors = [0.]*K

dataObj = DME.CompressedRowData(K)

idx = 0
for row in reader:
	idx += 1

	if (random.random() < float(options.sampleRate)):
		data = list(map(int, row))
		if (len(data) != K):
			logging.error("There are %s categories, but line has %s counts." % (K, len(data)))
			logging.error("line %s: %s" % (idx, data))
		
		
		while sum(data) > options.M: data[Sample.drawCategory(data)] -= 1
		
		sumData = sum(data)
		weightForMean = 1.0 / (1.0 + sumData)
		for i in range(0, K): priors[i] += data[i] * weightForMean
		dataObj.appendRow(data, 1)

	if (idx % 1000000) == 0: logging.debug("Loading Data: %s rows done" % idx)

dataLoadTime = time.time()
logging.debug("loaded %s records into memory" % idx)
logging.debug("time to load memory: %s " % (dataLoadTime - startTime))

for row in dataObj.U:
	if len(row) == 0 and not hasHyperprior:
		# TODO(max): write up a paper describing the hyperprior and link it.
		raise Exception("You can't have any columns with all 0s, unless you provide a hyperprior (-H)")
Example #8
def main(K, iterations, H, input_stream, sampleRate, M):
	startTime = time.time()
	logging.debug("K = " + str(K))
	logging.debug("iterations = " + str(iterations))
	logging.debug("H = " + str(H))
	logging.debug("sampleRate = " + str(sampleRate))
	logging.debug("M = " + str(M))

	# TODO(max): write up a paper describing the hyperprior and link it.
	W = 0
	Beta = [0]*K
	Hstr = H.split(",")
	hasHyperprior = False
	if (len(Hstr) == K + 1):
		for i in range(0, K): Beta[i] = float(Hstr[i])
		W = float(Hstr[K])
		hasHyperprior = True
	else:
		Beta = None
		W = None

	logging.debug("Beta = " + str(Beta))
	logging.debug("W = " + str(W))
	
	#####
	# Load Data
	#####
	csv.field_size_limit(1000000000)
	reader = csv.reader(input_stream, delimiter='\t')
	logging.debug("Loading data")
	priors = [0.]*K

	dataObj = DME.CompressedRowData(K)

	idx = 0
	for row in reader:
		idx += 1

		if (random.random() < float(sampleRate)):
			data = list(map(int, row))
			if (len(data) != K):
				logging.error("There are %s categories, but line has %s counts." % (K, len(data)))
				logging.error("line %s: %s" % (idx, data))
			
			
			while sum(data) > M: data[Sample.drawCategory(data)] -= 1
			
			sumData = sum(data)
			weightForMean = 1.0 / (1.0 + sumData)
			for i in range(0, K): priors[i] += data[i] * weightForMean
			dataObj.appendRow(data, 1)

		if (idx % 1000000) == 0: logging.debug("Loading Data: %s rows done" % idx)

	dataLoadTime = time.time()
	logging.debug("loaded %s records into memory" % idx)
	logging.debug("time to load memory: %s " % (dataLoadTime - startTime))

	for row in dataObj.U:
		if len(row) == 0 and not hasHyperprior:
			# TODO(max): write up a paper describing the hyperprior and link it.
			raise Exception("You can't have any columns with all 0s, unless you provide a hyperprior (-H)")

	priorSum = sum(priors) + 0.01 # Nudge to prevent zero
	for i in range(0, K):
		priors[i] /= priorSum
		priors[i] += 0.01 # Nudge to prevent zero

	priors = DME.findDirichletPriors(dataObj, priors, iterations, Beta, W)	

	# print "Final priors: ", priors
	logging.debug("Final average loss: %s" % DME.getTotalLoss(priors, dataObj, Beta, W))

	totalTime = time.time() - dataLoadTime
	logging.debug("Time to calculate: %s" % totalTime)
	return priors
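A hypothetical invocation of main (argument values are illustrative only; the enclosing module is assumed to already import csv, time, random, logging, DME, and Sample as in the snippets above):

import sys
import logging

logging.basicConfig(level=logging.DEBUG)
# Read tab-separated count rows from stdin, keep every row (sampleRate=1.0),
# cap each row at 10000 counts, run 50 iterations, and pass no hyperprior.
priors = main(K=5, iterations=50, H="", input_stream=sys.stdin,
              sampleRate=1.0, M=10000)
print(priors)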
Example #9
csv.field_size_limit(1000000000)
reader = csv.reader(sys.stdin, delimiter='\t')
logging.debug("Loading data")
dataObj = []

idx = 0
for row in reader:
  idx += 1
  
  if (random.random() < float(options.sampleRate)):
    data = list(map(int, row))
    if (len(data) != K):
      logging.error("There are %s categories, but line has %s counts." % (K, len(data)))
      logging.error("line %s: %s" % (idx, data))
    
    while sum(data) > options.M: data[Sample.drawCategory(data)] -= 1
    dataObj.append(data)
  
  if (idx % 1000000) == 0: logging.debug("Loading Data: %s rows done" % idx)

dataLoadTime = time.time()
logging.debug("loaded %s records into memory" % idx)
logging.debug("time to load memory: %s " % (dataLoadTime - startTime))

# TODO(max): enforce this
#for row in dataObj:
#	if len(row) == 0 and not hasHyperprior:
#		# TODO(max): write up a paper describing the hyperprior and link it.
#		raise Exception("You can't have any columns with all 0s, unless you provide a hyperprior (-H)")

# Mixture hyperparams (the mixture itself has a dirichlet prior)