Example #1
import os

import hadoopy


def starter(args, launch=True):
    """The function that calls hadoopy.launch_frozen."""
    # `gopts` is the script's global options helper, defined elsewhere in the
    # original module.
    gopts.args = args

    mat = args.get('mat', None)
    if mat is None:
        raise NameError("'mat' option not specified on the command line")

    input = mat
    matname, matext = os.path.splitext(mat)

    gopts.getintkey('blocksize', 3)
    schedule = gopts.getstrkey('reduce_schedule', '1')

    # clear the output
    output = args.get('output', '%s-qrr%s' % (matname, matext))
    if hadoopy.exists(output):
        print "Removing %s" % (output)
        hadoopy.rm(output)

    outputnamefunc = lambda x: output + "_iter%i" % (x)
    steps = schedule.split(',')

    jobconfs = []

    # determine the split size
    if 'split_size' in args:
        splitsize = args['split_size']
        jobconfs.append('mapreduce.input.fileinputformat.split.minsize=' +
                        str(splitsize))

    for i, step in enumerate(steps):
        if i > 0:
            input = curoutput
            mapper = 'org.apache.hadoop.mapred.lib.IdentityMapper'
        else:
            mapper = True  # use the command line mapper

        if i + 1 == len(steps):
            curoutput = output
        else:
            curoutput = output + "_iter%i" % (i + 1)
            if hadoopy.exists(curoutput):
                hadoopy.rm(curoutput)

        gopts.setkey('iter', i)

        if launch:
            hadoopy.launch_frozen(input,
                                  curoutput,
                                  __file__,
                                  mapper=mapper,
                                  cmdenvs=gopts.cmdenv(),
                                  num_reducers=int(step),
                                  jobconfs=jobconfs)
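
A minimal sketch of how this starter might be invoked, assuming the surrounding script exposes it at module level. The parse_args helper below is hypothetical; only the 'mat', 'output', 'split_size', and 'reduce_schedule' keys come from the code above.

import sys


def parse_args(argv):
    # Hypothetical helper: turn 'key=value' command-line arguments into the
    # args dict that starter() expects.
    return dict(arg.split('=', 1) for arg in argv if '=' in arg)


if __name__ == '__main__':
    # e.g. python tsqr.py mat=A.mseq reduce_schedule=4,1 split_size=536870912
    starter(parse_args(sys.argv[1:]))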
Example #2
import os

import hadoopy


def starter(args, launch=True):
    """The function that calls hadoopy.launch_frozen."""
    gopts.args = args

    mat = args.get('mat', None)
    if mat is None:
        raise NameError("'mat' option not specified on the command line")

    input = mat
    matname, matext = os.path.splitext(mat)

    gopts.getintkey('blocksize', 3)
    schedule = gopts.getstrkey('reduce_schedule', '1')

    # clear the output
    output = args.get('output', '%s-normal%s' % (matname, matext))
    if hadoopy.exists(output):
        print "Removing %s" % (output)
        hadoopy.rm(output)

    outputnamefunc = lambda x: output + "_iter%i" % (x)
    steps = schedule.split(',')

    for i, step in enumerate(steps):
        if i > 0:
            input = curoutput

        if i + 1 == len(steps):
            curoutput = output
        else:
            curoutput = output + "_iter%i" % (i + 1)
            if hadoopy.exists(curoutput):
                hadoopy.rm(curoutput)

        gopts.setkey('iter', i)

        if launch:
            if i > 0:
                mapper = "org.apache.hadoop.mapred.lib.IdentityMapper"
                hadoopy.launch_frozen(input,
                                      curoutput,
                                      __file__,
                                      mapper=mapper,
                                      cmdenvs=gopts.cmdenv(),
                                      num_reducers=int(step))
            else:
                hadoopy.launch_frozen(input,
                                      curoutput,
                                      __file__,
                                      cmdenvs=gopts.cmdenv(),
                                      num_reducers=int(step))
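
To make the scheduling behaviour concrete, here is a minimal dry-run sketch of the same loop logic, in pure Python with no Hadoop calls; plan_jobs and its printed format are illustrative only.

def plan_jobs(mat, output, schedule='1'):
    # Trace the iteration logic above: one MapReduce job per schedule entry,
    # chaining each job's output into the next job's input.
    steps = schedule.split(',')
    input = mat
    for i, step in enumerate(steps):
        if i > 0:
            input = curoutput
        if i + 1 == len(steps):
            curoutput = output
        else:
            curoutput = output + "_iter%i" % (i + 1)
        mapper = 'script mapper' if i == 0 else 'IdentityMapper'
        print("job %i: %s -> %s (mapper=%s, reducers=%s)"
              % (i, input, curoutput, mapper, step))


# plan_jobs('A.mseq', 'A-normal.mseq', '4,1') prints two jobs: the script's
# mapper with 4 reducers into A-normal.mseq_iter1, then IdentityMapper with
# 1 reducer into the final A-normal.mseq output.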
Example #3
            cur_args.extend(['-mapper', "'./tsqr map %i'" % (blocksize)])

        if i + 1 == len(steps):
            curoutput = output
        else:
            curoutput = output + "_iter%i" % (i + 1)

        # name the job and point it at this step's input, output, and reducer count
        cur_args.extend(['-jobconf', "'mapreduce.job.name=" + jobname +
                         " (%i/%i)'" % (i + 1, len(steps))])
        cur_args.extend(['-input', "'" + input + "'"])
        cur_args.extend(['-output', "'" + curoutput + "'"])
        cur_args.extend(['-numReduceTasks', "'%i'" % (int(step))])

        cmd = ['hadoop', 'jar', streaming_jar]
        cmd.extend(cur_args)

        print("Running Hadoop Command:")
        print("")
        print(' '.join(cmd))
        print("")
        print("End Hadoop Command")

        # clear any previous output before launching the streaming job
        if hadoopy.exists(curoutput):
            print("Removing %s" % (curoutput))
            hadoopy.rm(curoutput)

        subprocess.check_call(' '.join(cmd), shell=True)
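
Because this example is a fragment from the middle of the scheduling loop, here is a small self-contained sketch of the same command-building pattern. The values in the example call are placeholders, and only the flags that appear in the fragment are included; whatever cur_args already contained before this point (reducer, shipped files, and so on) is not shown above and is omitted here.

def build_streaming_cmd(streaming_jar, jobname, blocksize, input, curoutput,
                        step, i, nsteps):
    # Assemble a Hadoop Streaming command line the way the loop above does.
    cur_args = ['-mapper', "'./tsqr map %i'" % blocksize]
    cur_args.extend(['-jobconf',
                     "'mapreduce.job.name=%s (%i/%i)'" % (jobname, i + 1, nsteps)])
    cur_args.extend(['-input', "'" + input + "'"])
    cur_args.extend(['-output', "'" + curoutput + "'"])
    cur_args.extend(['-numReduceTasks', "'%i'" % int(step)])
    return ' '.join(['hadoop', 'jar', streaming_jar] + cur_args)


# Illustrative call for the first pass of a two-step schedule:
print(build_streaming_cmd('hadoop-streaming.jar', 'tsqr', 3,
                          'A.mseq', 'A-qrr.mseq_iter1', '4', 0, 2))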