def starter(args, launch=True):
    """Drive the chained TSQR MapReduce jobs via ``hadoopy.launch_frozen``.

    Runs one Hadoop job per entry in the comma-separated ``reduce_schedule``
    option, feeding each iteration's output into the next.  The final
    iteration writes to the user-visible output path; intermediate
    iterations write to ``<output>_iter<i>``.

    Parameters
    ----------
    args : dict
        Command-line options.  Required key: ``mat`` (input matrix path).
        Optional keys: ``output`` (defaults to ``<mat>-qrr<ext>``) and
        ``split_size`` (minimum input split size in bytes).
    launch : bool
        When False, perform all setup/cleanup but skip the actual
        ``launch_frozen`` calls (dry run).

    Raises
    ------
    NameError
        If the ``mat`` option is missing.  (Kept as NameError for
        backward compatibility with existing callers.)
    """
    gopts.args = args
    mat = args.get('mat', None)
    if mat is None:
        raise NameError("'mat' option not specified on the command line")
    matname, matext = os.path.splitext(mat)
    # Side effect: registers the default blocksize in gopts so it reaches the
    # mappers through cmdenv below.
    gopts.getintkey('blocksize', 3)
    schedule = gopts.getstrkey('reduce_schedule', '1')

    # Clear any stale final output before starting.
    output = args.get('output', '%s-qrr%s' % (matname, matext))
    if hadoopy.exists(output):
        print("Removing %s" % output)
        hadoopy.rm(output)

    steps = schedule.split(',')

    # Forward an explicit minimum split size to every job, if requested.
    jobconfs = []
    if 'split_size' in args:
        jobconfs.append('mapreduce.input.fileinputformat.split.minsize='
                        + str(args['split_size']))

    cur_input = mat
    for i, step in enumerate(steps):
        if i > 0:
            # Later iterations consume the previous iteration's output and
            # only need the identity mapper; the reduce does the work.
            cur_input = curoutput
            mapper = 'org.apache.hadoop.mapred.lib.IdentityMapper'
        else:
            mapper = True  # use the command-line mapper from this script
        if i + 1 == len(steps):
            curoutput = output
        else:
            curoutput = output + "_iter%i" % (i + 1)
        if hadoopy.exists(curoutput):
            hadoopy.rm(curoutput)
        gopts.setkey('iter', i)
        if launch:
            hadoopy.launch_frozen(cur_input, curoutput, __file__,
                                  mapper=mapper,
                                  cmdenvs=gopts.cmdenv(),
                                  num_reducers=int(step),
                                  jobconfs=jobconfs)
def starter(args, launch=True):
    """Drive the chained normalization MapReduce jobs via ``hadoopy.launch_frozen``.

    Runs one Hadoop job per entry in the comma-separated ``reduce_schedule``
    option, feeding each iteration's output into the next.  The final
    iteration writes to the user-visible output path; intermediate
    iterations write to ``<output>_iter<i>``.

    Parameters
    ----------
    args : dict
        Command-line options.  Required key: ``mat`` (input matrix path).
        Optional key: ``output`` (defaults to ``<mat>-normal<ext>``).
    launch : bool
        When False, perform all setup/cleanup but skip the actual
        ``launch_frozen`` calls (dry run).

    Raises
    ------
    NameError
        If the ``mat`` option is missing.  (Kept as NameError for
        backward compatibility with existing callers.)
    """
    gopts.args = args
    mat = args.get('mat', None)
    if mat is None:
        raise NameError("'mat' option not specified on the command line")
    matname, matext = os.path.splitext(mat)
    # Side effect: registers the default blocksize in gopts so it reaches the
    # mappers through cmdenv below.
    gopts.getintkey('blocksize', 3)
    schedule = gopts.getstrkey('reduce_schedule', '1')

    # Clear any stale final output before starting.
    output = args.get('output', '%s-normal%s' % (matname, matext))
    if hadoopy.exists(output):
        print("Removing %s" % output)
        hadoopy.rm(output)

    steps = schedule.split(',')

    cur_input = mat
    for i, step in enumerate(steps):
        # Extra launch kwargs: later iterations only need the identity
        # mapper (the reduce does the work); the first uses the default
        # command-line mapper from this script.
        extra = {}
        if i > 0:
            cur_input = curoutput
            extra['mapper'] = 'org.apache.hadoop.mapred.lib.IdentityMapper'
        if i + 1 == len(steps):
            curoutput = output
        else:
            curoutput = output + "_iter%i" % (i + 1)
        if hadoopy.exists(curoutput):
            hadoopy.rm(curoutput)
        gopts.setkey('iter', i)
        if launch:
            hadoopy.launch_frozen(cur_input, curoutput, __file__,
                                  cmdenvs=gopts.cmdenv(),
                                  num_reducers=int(step),
                                  **extra)
def starter(args, launch=True):
    """Drive the chained normalization MapReduce jobs via ``hadoopy.launch_frozen``.

    Runs one Hadoop job per entry in the comma-separated ``reduce_schedule``
    option, feeding each iteration's output into the next.  The final
    iteration writes to the user-visible output path; intermediate
    iterations write to ``<output>_iter<i>``.

    Parameters
    ----------
    args : dict
        Command-line options.  Required key: ``mat`` (input matrix path).
        Optional key: ``output`` (defaults to ``<mat>-normal<ext>``).
    launch : bool
        When False, perform all setup/cleanup but skip the actual
        ``launch_frozen`` calls (dry run).

    Raises
    ------
    NameError
        If the ``mat`` option is missing.  (Kept as NameError for
        backward compatibility with existing callers.)
    """
    gopts.args = args
    mat = args.get('mat', None)
    if mat is None:
        raise NameError("'mat' option not specified on the command line")
    matname, matext = os.path.splitext(mat)
    # Side effect: registers the default blocksize in gopts so it reaches the
    # mappers through cmdenv below.
    gopts.getintkey('blocksize', 3)
    schedule = gopts.getstrkey('reduce_schedule', '1')

    # Clear any stale final output before starting.
    output = args.get('output', '%s-normal%s' % (matname, matext))
    if hadoopy.exists(output):
        print("Removing %s" % output)
        hadoopy.rm(output)

    steps = schedule.split(',')

    cur_input = mat
    for i, step in enumerate(steps):
        # Extra launch kwargs: later iterations only need the identity
        # mapper (the reduce does the work); the first uses the default
        # command-line mapper from this script.
        extra = {}
        if i > 0:
            cur_input = curoutput
            extra['mapper'] = 'org.apache.hadoop.mapred.lib.IdentityMapper'
        if i + 1 == len(steps):
            curoutput = output
        else:
            curoutput = output + "_iter%i" % (i + 1)
        if hadoopy.exists(curoutput):
            hadoopy.rm(curoutput)
        gopts.setkey('iter', i)
        if launch:
            hadoopy.launch_frozen(cur_input, curoutput, __file__,
                                  cmdenvs=gopts.cmdenv(),
                                  num_reducers=int(step),
                                  **extra)
# NOTE(review): fragment of a Hadoop Streaming driver -- the enclosing
# function's `def` (and the loop header binding i/step/steps/cur_args/input/
# jobname/blocksize/streaming_jar) lies outside this chunk, so it is kept
# byte-identical rather than rewritten.  It appends per-iteration streaming
# arguments (-mapper, -jobconf job name, -input, -output, -numReduceTasks),
# assembles and echoes the `hadoop jar` command, removes any stale output
# path, then runs the command.  Uses Py2 print statements and
# subprocess.check_call(..., shell=True) -- presumably trusted, operator
# supplied paths; confirm before exposing to untrusted input.
cur_args.extend(['-mapper', "'./tsqr map %i'"%(blocksize)]) if i+1==len(steps): curoutput = output else: curoutput = output+"_iter%i"%(i+1) cur_args.extend(['-jobconf',"'mapreduce.job.name="+jobname+ " (%i/%i)'"%(i+1,len(steps))]) cur_args.extend(['-input',"'"+input+"'"]) cur_args.extend(['-output',"'"+curoutput+"'"]) cur_args.extend(['-numReduceTasks', "'%i'"%(int(step))]) cmd = ['hadoop','jar',streaming_jar] cmd.extend(cur_args) print "Running Hadoop Command:" print print ' '.join(cmd) print print "End Hadoop Command" if hadoopy.exists(curoutput): print "Removing %s"%(curoutput) hadoopy.rm(curoutput) subprocess.check_call(' '.join(cmd),shell=True)