import math
import os
import random
import sys

import numpy

import dumbo.util

# util (setstatus, array2list) and gopts (typed job options) are provided
# elsewhere in this program and are assumed to be in scope here.


def second_mapper(data):
    """Buffer up to maxlocal rows, mix each full buffer through a random
    orthogonal matrix via localQoutput, and emit the rows under random keys."""
    n = gopts.getintkey('ncols')
    m = int(os.getenv('nrows'))
    maxlocal = int(os.getenv('maxlocal'))
    totalrows = 0
    totalouts = 0
    rows = []
    util.setstatus('acquiring data with ncols=%i' % (n))
    for key, value in data:
        assert len(value) == n
        rows.append(value)
        totalrows += 1
        if len(rows) >= maxlocal:
            dumbo.util.incrcounter('Program', 'rows acquired', len(rows))
            totalouts += 1
            # emit this block under random keys so the rows get shuffled
            # across reducers in the next pass
            for row in localQoutput(rows):
                key = random.randint(0, 4000000000)
                yield key, row
            # reset rows, status
            rows = []
            util.setstatus('acquiring data with ncols=%i' % (n))
    # flush the final, partially filled buffer
    if len(rows) > 0:
        for row in localQoutput(rows):
            key = random.randint(0, 4000000000)
            yield key, row
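
# The buffer-and-flush pattern above is worth seeing in isolation: collect
# items until a local limit is hit, transform the whole block at once, emit,
# and repeat, with a final flush for the leftover partial block. The sketch
# below is illustrative only; the names (buffered_blocks, flush) are
# hypothetical and not part of this program.
def buffered_blocks(stream, flush, maxlocal):
    buf = []
    for item in stream:
        buf.append(item)
        if len(buf) >= maxlocal:
            # transform and emit a full block, then start a fresh buffer
            for out in flush(buf):
                yield out
            buf = []
    if buf:
        # flush whatever is left at the end of the stream
        for out in flush(buf):
            yield out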
def first_mapper(data):
    """
    This mapper doesn't take any input, and generates the R factor.
    """
    hostname = os.uname()[1]
    print >>sys.stderr, hostname, "is a mapper"
    # suck up all the data so Hadoop doesn't complain
    for key, val in data:
        pass

    n = gopts.getintkey('ncols')
    m = int(os.getenv('nrows'))
    k = int(os.getenv('maprows')) / n
    s = float(m) / float(n)

    util.setstatus(
        "generating %i-by-%i R matrix with scale factor %i/%i=%s" % (
            n, n, m, n, s))
    R = numpy.triu(numpy.ones((n, n))) / math.sqrt(s)

    for i in xrange(k):
        util.setstatus(
            'step %i/%i: generating local %i-by-%i Q matrix' % (i + 1, k, n, n))
        Q = numpy.linalg.qr(numpy.random.randn(n, n))[0]  # just the Q factor
        util.setstatus('step %i/%i: multiplying local matrix' % (i + 1, k))
        A = Q.dot(R)
        util.setstatus('step %i/%i: outputting %i rows' % (i + 1, k, A.shape[0]))
        for row in A:
            key = random.randint(0, 4000000000)
            yield key, util.array2list(row)
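
# Why this works: each n-by-n block is Q_i * R with Q_i orthogonal, so the
# full stacked matrix A satisfies A'A = (m/n) * R'R. With R scaled by
# 1/sqrt(s) and s = m/n, that is exactly triu(ones)' * triu(ones), so the R
# factor of the stacked matrix is triu(ones) up to row signs. A minimal
# local check of that claim (a sketch using plain numpy, independent of the
# Hadoop machinery; the function name is hypothetical):
def _check_r_recovery(n=4, k=3):
    # locally, the full matrix is k blocks of n rows, so m = k*n and s = k
    s = float(k)
    R = numpy.triu(numpy.ones((n, n))) / math.sqrt(s)
    blocks = []
    for _ in xrange(k):
        Q = numpy.linalg.qr(numpy.random.randn(n, n))[0]
        blocks.append(Q.dot(R))
    A = numpy.vstack(blocks)
    Rhat = numpy.linalg.qr(A)[1]
    # QR is unique only up to the signs of R's rows; the 1/sqrt(s) scaling
    # makes the recovered factor triu(ones) up to those signs
    assert numpy.allclose(numpy.abs(Rhat), numpy.triu(numpy.ones((n, n))))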
def localQoutput(rows):
    util.setstatus('converting to numpy array')
    A = numpy.array(rows)
    localm = A.shape[0]

    util.setstatus('generating local Q of size %i-by-%i' % (localm, localm))
    Q = numpy.linalg.qr(numpy.random.randn(localm, localm))[0]  # just the Q factor

    util.setstatus(
        'multiplying %i-by-%i Q by %i-by-%i A' % (localm, localm, localm, A.shape[1]))
    A = Q.dot(A)

    util.setstatus('outputting')
    for row in A:
        yield util.array2list(row)
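
# localQoutput replaces the buffered block A by Q*A for a random orthogonal
# Q, which scrambles the rows without changing A'A, and hence leaves the R
# factor unchanged up to row signs. A minimal check of that invariant (a
# plain-numpy sketch; the function name is hypothetical):
def _check_mix_preserves_r(m=6, n=3):
    A = numpy.random.randn(m, n)
    Q = numpy.linalg.qr(numpy.random.randn(m, m))[0]
    R1 = numpy.linalg.qr(A)[1]
    R2 = numpy.linalg.qr(Q.dot(A))[1]
    # (QA)'(QA) = A'A, so both R factors agree up to the signs of their rows
    assert numpy.allclose(numpy.abs(R1), numpy.abs(R2))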
    def setstatus(self, msg):
        # method wrapper that delegates to the module-level setstatus function
        setstatus(msg)
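
# A sketch of how the two mappers might be chained as two dumbo iterations,
# with identity reduces so the random keys only serve to shuffle rows between
# passes. This runner is an assumption for illustration; the program's real
# driver (and its option handling via gopts) may differ.
import dumbo.lib

def runner(job):
    job.additer(first_mapper, dumbo.lib.identityreducer)
    job.additer(second_mapper, dumbo.lib.identityreducer)

if __name__ == '__main__':
    dumbo.main(runner)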