Example #1
 def setUpClass(self):
     from fastlmm.util.util import create_directory_if_necessary
     create_directory_if_necessary(self.tempout_dir, isfile=False)
     self.pythonpath = os.path.abspath(os.path.join(os.path.dirname(os.path.realpath(__file__)),"..","..",".."))
     self.bedbase = os.path.join(self.pythonpath, 'tests/datasets/all_chr.maf0.001.N300')
     self.phen_fn = os.path.join(self.pythonpath, 'tests/datasets/phenSynthFrom22.23.N300.randcidorder.txt')
     self.cov_fn = os.path.join(self.pythonpath,  'tests/datasets/all_chr.maf0.001.covariates.N300.txt')
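Every example on this page revolves around create_directory_if_necessary from fastlmm.util.util. Its contract, as inferred from these call sites, appears to be: with isfile=True (the default) the argument is a file path and only its parent directory is created; with isfile=False the path itself is created as a directory; robust=True tolerates races when many cluster tasks create the same directory at once. A minimal sketch under those assumptions (the real implementation may differ):

import os
import time

def create_directory_if_necessary(name, isfile=True, robust=False):
    # Hypothetical re-implementation for illustration only.
    directory = os.path.dirname(name) if isfile else name
    if directory == "" or os.path.isdir(directory):
        return
    try:
        os.makedirs(directory)
    except OSError:
        if not robust:
            raise
        # robust=True: another task may have created it first; re-check before failing.
        time.sleep(1)
        if not os.path.isdir(directory):
            raise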
Example #2
    def run(self, distributable):
        # Check that the local machine has python path set
        localpythonpath = os.environ.get("PYTHONPATH") #!! should this work without PYTHONPATH being set (e.g., if there were just one file)? Note: os.environ.get returns None when the variable is unset; it does not raise.
        if localpythonpath is None: raise Exception("Expect the local machine to have 'PYTHONPATH' set")

        remotepythoninstall = self.check_remote_pythoninstall()

        remotewd, run_dir_abs, run_dir_rel, nodelocalwd = self.create_run_dir()
        util.create_directory_if_necessary(os.path.join(remotewd, distributable.tempdirectory), isfile=False) #create temp directory now so that cluster tasks won't try to create it many times at once
        result_remote = os.path.join(run_dir_abs,"result.p")

        self.copy_python_settings(run_dir_abs)

        inputOutputCopier = HPCCopier(remotewd,skipinput=self.skipinputcopy) #Create the object that copies input and output files to where they are needed

        inputOutputCopier.input(distributable) # copy the input files to where they are needed (i.e. the cluster)

        remotepythonpath = self.FindOrCreateRemotePythonPath(localpythonpath, run_dir_abs)

        batfilename_rel = self.create_bat_file(distributable, remotepythoninstall, remotepythonpath, remotewd, run_dir_abs, run_dir_rel, result_remote, nodelocalwd, distributable)

        self.submit_to_cluster(batfilename_rel, distributable, remotewd, run_dir_abs, run_dir_rel, nodelocalwd)

        inputOutputCopier.output(distributable) # copy the output files from where they were created (i.e. the cluster) to the local computer

        assert os.path.exists(result_remote), "The HPC job produced no result (and, thus, likely failed)"
        with open(result_remote, mode='rb') as f:
            result = pickle.load(f)

        #logging.info('Done: HPC runner is running a distributable. Returns {0}'.format(result))
        return result
Example #5
 def setUpClass(self):
     from fastlmm.util.util import create_directory_if_necessary
     create_directory_if_necessary(self.tempout_dir, isfile=False)
     self.pythonpath = os.path.abspath(os.path.join(os.path.dirname(os.path.realpath(__file__)),"..","..",".."))
     self.bedbase = os.path.join(self.pythonpath, 'fastlmm/feature_selection/examples/toydata.5chrom')
     self.phen_fn = os.path.join(self.pythonpath, 'fastlmm/feature_selection/examples/toydata.phe')
     self.cov_fn = os.path.join(self.pythonpath,  'fastlmm/feature_selection/examples/toydata.cov')
Example #6
def run_one_task(original_distributable, taskindex, taskcount, workdirectory):
    '''
    Does a fraction of the work (e.g. 1 of every 1000 work items) and then saves the results to a single file.
    If taskindex == taskcount, does the reduce step instead.
    '''

    if not 0 < taskcount: raise Exception("Expect taskcount to be positive")
    if not (0 <= taskindex and taskindex < taskcount + 1):
        raise Exception(
            "Expect taskindex to be between 0 and taskcount (both inclusive)"
        )

    shaped_distributable = shape_to_desired_workcount(original_distributable,
                                                      taskcount)

    if shaped_distributable.work_count != taskcount:
        raise Exception("Assert: expect workcount == taskcount")

    util.create_directory_if_necessary(workdirectory,
                                       isfile=False,
                                       robust=True)

    if (taskindex < taskcount):
        doMainWorkForOneIndex(shaped_distributable, taskcount, taskindex,
                              workdirectory)
        return None
    else:
        result_sequence = work_sequence_from_disk(workdirectory, taskcount)
        return shaped_distributable.reduce(result_sequence)
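The index convention here is worth spelling out: task indices 0 through taskcount-1 each do one slice of the map work and return None, while index taskcount performs the reduce step over the partial results saved in workdirectory. A hypothetical single-machine driver (run_one_task as above; 'distributable' comes from the surrounding framework, see the other examples):

# Hypothetical driver, for illustration: run every map task in turn, then reduce.
taskcount = 4
workdirectory = "work_tmp"
for taskindex in range(taskcount):
    run_one_task(distributable, taskindex, taskcount, workdirectory)  # map slices
result = run_one_task(distributable, taskcount, taskcount, workdirectory)  # reduce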
Example #7
 def reduce(self, result_sequence):
     '''
     Save each (index, pcs) result from the sequence to its own cache file.
     '''
     for i, pcs in result_sequence:
         out_fn = self.create_out_fn(self.cache_prefix, i)
         util.create_directory_if_necessary(out_fn)
         save(out_fn, pcs)
     return None
Example #9
    def setUpClass(self):
        from fastlmm.util.util import create_directory_if_necessary
        create_directory_if_necessary(self.tempout_dir, isfile=False)
        self.pythonpath = os.path.abspath(os.path.join(os.path.dirname(os.path.realpath(__file__)),"..","..",".."))

        self.snpreader_whole = Bed(self.pythonpath + "/tests/datasets/synth/all")
        self.covariate_whole = Pheno(self.pythonpath + "/tests/datasets/synth/cov.txt")
        self.pheno_whole = Pheno(self.pythonpath + "/tests/datasets/synth/pheno_10_causals.txt")
Example #11
 def create_distributablep(self, distributable, run_dir_abs, run_dir_rel):
     logging.info('Hadoop runner is pickling distributable')
     distributablep_filename_rel = os.path.join(run_dir_rel, "distributable.p")
     #distributablep_filename_abs = os.path.join(run_dir_abs, "distributable.p")
     util.create_directory_if_necessary(distributablep_filename_rel)
     with open(distributablep_filename_rel, mode='wb') as f:
         pickle.dump(distributable, f, pickle.HIGHEST_PROTOCOL)
     logging.info('Done: Hadoop runner is pickling distributable')
     return distributablep_filename_rel
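The distributable.p written here is presumably read back by the distributable.py script that the batch files elsewhere on this page invoke on the cluster nodes. A minimal sketch of that consumer side, assuming it simply unpickles the object and hands it to a runner:

import pickle

# Hypothetical consumer: reload the pickled distributable on the remote side.
with open("distributable.p", mode="rb") as f:
    distributable = pickle.load(f)
# a runner such as LocalInParts(taskindex, taskcount) would then call .run(distributable)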
Example #13
    def run(self, distributable):
        JustCheckExists().input(distributable)

        localpath = os.environ["PATH"]
        localwd = os.getcwd()

        import datetime
        now = datetime.datetime.now()
        run_dir_rel = os.path.join("runs", util.datestamp(appendrandom=True))
        run_dir_abs = os.path.join(localwd, run_dir_rel)
        util.create_directory_if_necessary(run_dir_rel, isfile=False)

        distributablep_filename = os.path.join(run_dir_rel, "distributable.p")
        with open(distributablep_filename, mode='wb') as f:
            pickle.dump(distributable, f, pickle.HIGHEST_PROTOCOL)

        distributable_py_file = os.path.join(os.path.dirname(__file__), "..",
                                             "distributable.py")
        if not os.path.exists(distributable_py_file):
            raise Exception("Expect file at " + distributable_py_file +
                            ", but it doesn't exist.")
        command_format_string = sys.executable + " " + distributable_py_file + " " + distributablep_filename + " LocalInParts({0},{1},mkl_num_threads={2})".format(
            "{0}", self.taskcount, self.mkl_num_threads)

        if not self.just_one_process:
            proc_list = []
            for taskindex in range(self.taskcount):
                command_string = command_format_string.format(taskindex)
                proc = subprocess.Popen(
                    command_string.split(" "), cwd=os.getcwd()
                )  #!!!bug: command_string.split(" ") breaks paths that contain spaces, e.g. if Anaconda is installed in c:\program files\anaconda2 this will fail
                proc_list.append(proc)

            for taskindex, proc in enumerate(proc_list):
                rc = proc.wait()
                #for line in proc.stdout.readlines():
                #    sys.stdout.write(line)
                if rc != 0:
                    raise Exception(
                        "Python subprocess for task #{0} returned a non-zero exit code"
                        .format(taskindex))
        else:
            from fastlmm.util.runner import LocalInParts
            for taskindex in range(self.taskcount):
                LocalInParts(
                    taskindex,
                    self.taskcount,
                    mkl_num_threads=self.mkl_num_threads).run(distributable)

        result = run_one_task(distributable, self.taskcount, self.taskcount,
                              distributable.tempdirectory)

        JustCheckExists().output(distributable)
        return result
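Note the two-stage formatting in command_format_string above: the literal string "{0}" passed as the first argument survives the outer .format call, leaving a placeholder that the loop then fills with each taskindex. The same trick in miniature:

template = "LocalInParts({0},{1},mkl_num_threads={2})".format("{0}", 4, 1)
print(template)            # LocalInParts({0},4,mkl_num_threads=1)
print(template.format(7))  # LocalInParts(7,4,mkl_num_threads=1)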
Example #14
 def FindOrCreateRemotePythonPath(self, localpythonpath, run_dir_abs):
     if self.remote_python_parent is None:
         remotepythonpath = self.CopySource(localpythonpath, run_dir_abs)
     else:
         util.create_directory_if_necessary(self.remote_python_parent,isfile=False)
         path_list = []
         for rel in os.listdir(self.remote_python_parent):
             path_list.append(os.path.join(self.remote_python_parent,rel))
         remotepythonpath = ";".join(path_list)
         if self.update_remote_python_parent:
             remotepythonpath = self.CopySource(localpythonpath, run_dir_abs)
     
     return remotepythonpath
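The hard-coded ';' separator works because these runners target Windows HPC clusters; a portable version of the join above could use os.pathsep instead (a suggestion, not what the project does):

import os
# os.pathsep is ';' on Windows and ':' elsewhere
remotepythonpath = os.pathsep.join(path_list)  # path_list as built above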
Example #15
 def output(self,item):
     if isinstance(item, str):
         itemnorm = os.path.normpath(item)
         util.create_directory_if_necessary(itemnorm)
         remote_file_name = os.path.join(self.remotewd,itemnorm)
         local_dir_name,ignore = os.path.split(itemnorm)
         #xcopycommand = "xcopy /d /e /s /c /h /y {0} {1}".format(remote_file_name, local_dir_name) # we copy to the local dir instead of the local file so that xcopy won't ask 'file or dir?'
         xcopycommand = "xcopy /d /c /y {0} {1}".format(remote_file_name, local_dir_name) # we copy to the local 
         logging.info(xcopycommand)
         rc = os.system(xcopycommand)
         if rc!=0: logging.info("xcopy cmd failed with return value={0}, from cmd {1}".format(rc,xcopycommand))
     elif hasattr(item,"copyoutputs"):
         item.copyoutputs(self)
Example #16
 def setUpClass(self):
     from fastlmm.util.util import create_directory_if_necessary
     create_directory_if_necessary(self.tempout_dir, isfile=False)
     self.pythonpath = os.path.abspath(
         os.path.join(os.path.dirname(os.path.realpath(__file__)), "..",
                      "..", ".."))
     self.bedbase = os.path.join(
         self.pythonpath,
         'fastlmm/feature_selection/examples/toydata.5chrom')
     self.phen_fn = os.path.join(
         self.pythonpath, 'fastlmm/feature_selection/examples/toydata.phe')
     self.cov_fn = os.path.join(
         self.pythonpath, 'fastlmm/feature_selection/examples/toydata.cov')
Example #18
    def run(self, distributable):
        tempdir = os.path.join(self.run_dir,distributable.tempdirectory)
        if self.taskindex != self.taskcount:
            JustCheckExists().input(distributable)
            return run_one_task(distributable, self.taskindex, self.taskcount, tempdir)
        else:
            result = run_one_task(distributable, self.taskindex, self.taskcount, tempdir)
            if self.result_file is not None:
                util.create_directory_if_necessary(self.result_file)
                with open(self.result_file, mode='wb') as f:
                    pickle.dump(result, f, pickle.HIGHEST_PROTOCOL)

            return result
Example #19
File: test.py Project: omerwe/LEAP
	def setUpClass(self):
		from fastlmm.util.util import create_directory_if_necessary
		create_directory_if_necessary(self.tempout_dir, isfile=False)
		self.pythonpath = os.path.abspath(os.path.join(os.path.dirname(os.path.realpath(__file__))))
		self.bedbase = os.path.join(self.pythonpath, 'dataset1/dataset1')		
		self.phen_fn = os.path.join(self.pythonpath, 'dataset1/dataset1.phe')
		
		#Create eigendecompositions
		logging.info("Creating eigendecomposition files")		
		for i in range(1,11):
			output_file = os.path.abspath(os.path.join(self.tempout_dir, 'dataset1_nochr{}.npz'.format(i)))
			extractSim = 'dataset1/extracts/nochr{0}_extract.txt'.format(i)
			bed, _ = leapUtils.loadData(self.bedbase, extractSim, self.phen_fn, loadSNPs=True)
			leapMain.eigenDecompose(bed, output_file)
Example #20
 def output(self,item):
     if isinstance(item, str):
         itemnorm = os.path.normpath(item)
         util.create_directory_if_necessary(itemnorm)
         remote_file_name = os.path.join(self.remotewd,itemnorm)
         local_dir_name,ignore = os.path.split(itemnorm)
         assert os.path.exists(remote_file_name), "Don't see expected file '{0}'. Did the HPC job fail?".format(remote_file_name)
         #xcopycommand = "xcopy /d /e /s /c /h /y {0} {1}".format(remote_file_name, local_dir_name) # we copy to the local dir instead of the local file so that xcopy won't ask 'file or dir?'
         xcopycommand = "xcopy /d /c /y {0} {1}".format(remote_file_name, local_dir_name) # we copy to the local 
         logging.info(xcopycommand)
         rc = os.system(xcopycommand)
         if rc!=0: logging.info("xcopy cmd failed with return value={0}, from cmd {1}".format(rc,xcopycommand))
     elif hasattr(item,"copyoutputs"):
         item.copyoutputs(self)
Example #21
	def setUpClass(self):
		from fastlmm.util.util import create_directory_if_necessary
		create_directory_if_necessary(self.tempout_dir, isfile=False)
		self.pythonpath = os.path.abspath(os.path.join(os.path.dirname(os.path.realpath(__file__))))
		self.bedbase = os.path.join(self.pythonpath, 'dataset1/dataset1')		
		self.phen_fn = os.path.join(self.pythonpath, 'dataset1/dataset1.phe')
		
		#Create eigendecompositions
		logging.info("Creating eigendecomposition files")		
		for i in range(1,11):
			output_file = os.path.abspath(os.path.join(self.tempout_dir, 'dataset1_nochr{}.npz'.format(i)))
			extractSim = 'dataset1/extracts/nochr{0}_extract.txt'.format(i)
			bed, _ = leapUtils.loadData(self.bedbase, extractSim, self.phen_fn, loadSNPs=True)
			leapMain.eigenDecompose(bed, output_file)
Example #22
 def create_run_dir(self):
     username = os.environ["USERNAME"]
     localwd = os.getcwd()
     #!!make an option to specify the full remote WD. Also what is the "\\\\" case for?
     if localwd.startswith("\\\\"):
         remotewd = self.fileshare + os.path.sep + username +os.path.sep + "\\".join(localwd.split('\\')[4:])
     else:
         remotewd = self.fileshare + os.path.sep + username + os.path.splitdrive(localwd)[1]  #using '+' because 'os.path.join' doesn't work with shares
     import datetime
     now = datetime.datetime.now()
     run_dir_rel = os.path.join("runs",util.datestamp(appendrandom=True))
     run_dir_abs = os.path.join(remotewd,run_dir_rel)
     util.create_directory_if_necessary(run_dir_abs,isfile=False)
     return remotewd, run_dir_abs, run_dir_rel
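The splitdrive call is what maps the local working directory onto the file share: it strips the drive letter so the remainder can be appended to fileshare + username. An illustration with made-up paths (on Windows, where splitdrive recognizes drive letters):

import os
localwd = r"C:\source\myproject"      # made-up path
print(os.path.splitdrive(localwd))    # ('C:', '\\source\\myproject') on Windows
# remotewd then becomes e.g. \\fileshare\username\source\myproject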
Example #23
 def input(self,item):
     if self.skipinput:
         return
     if isinstance(item, str):
         itemnorm = os.path.normpath(item)
         remote_file_name = os.path.join(self.remotewd,itemnorm)
         remote_dir_name,ignore = os.path.split(remote_file_name)
         util.create_directory_if_necessary(remote_file_name)
         xcopycommand = "xcopy /d /e /s /c /h /y {0} {1}".format(itemnorm, remote_dir_name)
         logging.info(xcopycommand)
         rc = os.system(xcopycommand)
         print("rc=" +str(rc))
         if rc!=0: raise Exception("xcopy cmd failed with return value={0}, from cmd {1}".format(rc,xcopycommand))
     elif hasattr(item,"copyinputs"):
         item.copyinputs(self)
Example #24
 def input(self,item):
     if self.skipinput:
         return
     if isinstance(item, str):
         itemnorm = os.path.normpath(item)
         remote_file_name = os.path.join(self.remotewd,itemnorm)
         remote_dir_name,ignore = os.path.split(remote_file_name)
         util.create_directory_if_necessary(remote_file_name)
         xcopycommand = "xcopy /d /e /s /c /h /y {0} {1}".format(itemnorm, remote_dir_name)
         logging.info(xcopycommand)
         rc = os.system(xcopycommand)
         print "rc=" +str(rc)
         if rc!=0: raise Exception("xcopy cmd failed with return value={0}, from cmd {1}".format(rc,xcopycommand))
     elif hasattr(item,"copyinputs"):
         item.copyinputs(self)
Example #25
    def run(self, distributable):
        if self.taskindex != self.taskcount:
            JustCheckExists().input(distributable)
            return run_one_task(distributable, self.taskindex, self.taskcount,
                                distributable.tempdirectory)
        else:
            result = run_one_task(distributable, self.taskindex,
                                  self.taskcount, distributable.tempdirectory)
            JustCheckExists().output(distributable)

            if self.result_file is not None:
                util.create_directory_if_necessary(self.result_file)
                with open(self.result_file, mode='wb') as f:
                    pickle.dump(result, f, pickle.HIGHEST_PROTOCOL)

            return result
Example #26
    def run(self, distributable):
        JustCheckExists().input(distributable)

        localpath = os.environ["PATH"]
        localwd = os.getcwd()

        import datetime

        now = datetime.datetime.now()
        run_dir_rel = os.path.join("runs", util.datestamp(appendrandom=True))
        run_dir_abs = os.path.join(localwd, run_dir_rel)
        util.create_directory_if_necessary(run_dir_rel, isfile=False)

        distributablep_filename = os.path.join(run_dir_rel, "distributable.p")
        with open(distributablep_filename, mode="wb") as f:
            pickle.dump(distributable, f, pickle.HIGHEST_PROTOCOL)

        distributable_py_file = os.path.join(os.path.dirname(__file__), "..", "distributable.py")
        if not os.path.exists(distributable_py_file):
            raise Exception("Expect file at " + distributable_py_file + ", but it doesn't exist.")
        command_format_string = (
            sys.executable
            + " "
            + distributable_py_file
            + " "
            + distributablep_filename
            + " LocalInParts({0},{1},mkl_num_threads={2})".format("{0}", self.taskcount, self.mkl_num_threads)
        )

        proc_list = []
        for taskindex in range(self.taskcount):
            command_string = command_format_string.format(taskindex)
            proc = subprocess.Popen(command_string.split(" "), cwd=os.getcwd())
            proc_list.append(proc)

        for taskindex, proc in enumerate(proc_list):
            rc = proc.wait()
            # for line in proc.stdout.readlines():
            #    sys.stdout.write(line)
            if rc != 0:
                raise Exception(
                    "Python subprocess for task #{0} returned a non-zero exit code".format(taskindex)
                )

        result = run_one_task(distributable, self.taskcount, self.taskcount, distributable.tempdirectory)
        JustCheckExists().output(distributable)
        return result
Example #27
 def create_run_dir(self):
     username = os.environ["USERNAME"]
     localwd = os.getcwd()
     #!!make an option to specify the full remote WD. Also what is the "\\\\" case for?
     if localwd.startswith("\\\\"):
         remotewd = self.fileshare + os.path.sep + username + os.path.sep + "\\".join(
             localwd.split('\\')[4:])
     else:
         remotewd = self.fileshare + os.path.sep + username + os.path.splitdrive(
             localwd
         )[1]  #using '+' because 'os.path.join' doesn't work with shares
     import datetime
     now = datetime.datetime.now()
     run_dir_rel = os.path.join("runs", util.datestamp(appendrandom=True))
     run_dir_abs = os.path.join(remotewd, run_dir_rel)
     util.create_directory_if_necessary(run_dir_abs, isfile=False)
     return remotewd, run_dir_abs, run_dir_rel
Example #28
    def create_bat_file(self, distributable, remotepythonpath, remotewd, run_dir_abs, run_dir_rel, result_remote, result_hdfs):
        logging.info('Hadoop runner is creating bat file')

        outFileList = Hadoop.RecursivelyGetAllOutputs(distributable)

        distributablep_filename_rel = self.create_distributablep(distributable, run_dir_abs, run_dir_rel)

        distributable_py_file = os.path.join(os.path.dirname(__file__),"..","distributable.py")
        if not os.path.exists(distributable_py_file): raise Exception("Expect file at " + distributable_py_file + ", but it doesn't exist.")
        localfilepath, file = os.path.split(distributable_py_file)
        remoteexepath = os.path.join(remotepythonpath.split(';')[0],"fastlmm","util") #!!shouldn't need to assume where the file is in source

        batfilename_abs_list = []
        for part in ["Mapper","Reducer"]:
            command_string = remoteexepath + os.path.sep + file + r""" distributable.p "Local{0}({1},""{2}"",mkl_num_threads={3},logging_handler=logging.StreamHandler())" """.format(
                    part,
                    self.taskcount,
                    result_remote.replace("\\","/"), #change DOS separators to Unix separators because Python accepts either and this avoids problems when parsing the batch file
                    self.mkl_num_threads)
            batfilename_rel = os.path.join(run_dir_rel,"dist{0}.bat".format(part))
            batfilename_abs = "hdfs:" + os.path.join(run_dir_abs,"dist{0}.bat".format(part)).replace("\\","/")
            batfilename_abs_list.append(batfilename_abs)
            util.create_directory_if_necessary(batfilename_rel, isfile=True)
            with open(batfilename_rel, "w") as batfile:
                batfile.write("@set path={0};{0}\Scripts;%path%\n".format(r"c:\GCD\esciencepy"))
                batfile.write("@set PYTHONPATH={0}\n".format(remotepythonpath))
                batfile.write("@set home=%cd%\n")
                #batfile.write("@mklink /d .continuum continuum\n")
                #batfile.write("@dir /s\n")
                #batfile.write("@set R_HOME={0}\n".format(os.path.join(remotepythoninstall,"R-2.15.2")))
                #batfile.write("@set R_USER={0}\n".format("."))
                batfile.write("@mkdir {0}\n@mkdir {0}\\tex.cache\n@set MPLCONFIGDIR={0}\n".format(".matplotlib"))
                batfile.write("@mkdir {0}\nset IPYTHONDIR={0}\n".format(".ipython"))
                #batfile.write("xcopy /d /e /s /c /h /i continuum .continuum\n")
                batfile.write("@call python {0}\n".format(command_string))
                if part == "Reducer":
                    batfile.write("@call %HADOOP_HOME%\\bin\\hadoop fs -rm {0} -skipTrash\n".format(result_hdfs))
                    batfile.write("@call %HADOOP_HOME%\\bin\\hadoop fs -copyFromLocal {0} {1}\n".format(result_remote, result_hdfs))
                    for outfile in outFileList:
                        hdfsOutFile = remotewd + "/" + outfile
                        batfile.write("@call %HADOOP_HOME%\\bin\\hadoop fs -rm {0}\n".format(hdfsOutFile))
                        batfile.write("@call %HADOOP_HOME%\\bin\\hadoop fs -copyFromLocal {0} {1}\n".format(outfile, hdfsOutFile))
        picklefilename_abs = "hdfs:" + os.path.join(run_dir_abs,"distributable.p").replace("\\","/")
        batfilename_abs_list.append(picklefilename_abs)
        logging.info('Done: Hadoop runner is creating bat file')
        return batfilename_abs_list
Example #29
    def run(self, original_distributable):
        result_sequence = self.work_sequence_from_stdin()
        shaped_distributable = shape_to_desired_workcount(original_distributable, self.taskcount)
        if shaped_distributable.work_count != self.taskcount : raise Exception("Assert: expect workcount == taskcount")
        result = shaped_distributable.reduce(result_sequence)
        #close the instream if it is a file?

        #Check that all expected output files are there
        JustCheckExists(doPrintOutputNames=True).output(original_distributable)

        #Pickle the result to a file
        #logging.info("AAA\n\n\n\nABCwd='{0}'\n\nfile='{1}'DEF\n\n\nZZZ".format(os.getcwd(),self.output_file))
        if self.result_file is not None:
            util.create_directory_if_necessary(self.result_file)
            with open(self.result_file, mode='wb') as f:
                pickle.dump(result, f, pickle.HIGHEST_PROTOCOL)
        return result
Example #32
    def create_bat_file(self, distributable, remotepythoninstall,
                        remotepythonpath, remotewd, run_dir_abs, run_dir_rel,
                        result_remote):
        path_share_list = [r"", r"Scripts"]
        remotepath_list = []
        for path_share in path_share_list:
            path_share_abs = os.path.join(remotepythoninstall, path_share)
            if not os.path.isdir(path_share_abs):
                raise Exception(
                    "Expect path directory at '{0}'".format(path_share_abs))
            remotepath_list.append(path_share_abs)
        remotepath = ";".join(remotepath_list)

        distributablep_filename_rel = self.create_distributablep(
            distributable, run_dir_abs, run_dir_rel)

        distributable_py_file = os.path.join(os.path.dirname(__file__), "..",
                                             "distributable.py")
        if not os.path.exists(distributable_py_file):
            raise Exception("Expect file at " + distributable_py_file +
                            ", but it doesn't exist.")
        localfilepath, file = os.path.split(distributable_py_file)
        remoteexepath = os.path.join(
            remotepythonpath.split(';')[0], "fastlmm",
            "util")  #!!shouldn't need to assume where the file is in source
        #run_dir_rel + os.path.sep + "pythonpath" + os.path.sep + os.path.splitdrive(localfilepath)[1]

        result_remote2 = result_remote.encode("string-escape")
        command_string = remoteexepath + os.path.sep + file + " " + distributablep_filename_rel + r""" "LocalInParts(%1,{0},mkl_num_threads={1},result_file=""{2}"") " """.format(
            self.taskcount, self.mkl_num_threads, result_remote2)
        batfilename_rel = os.path.join(run_dir_rel, "dist.bat")
        batfilename_abs = os.path.join(run_dir_abs, "dist.bat")
        util.create_directory_if_necessary(batfilename_abs, isfile=True)
        matplotlibfilename_rel = os.path.join(run_dir_rel, ".matplotlib")
        matplotlibfilename_abs = os.path.join(run_dir_abs, ".matplotlib")
        util.create_directory_if_necessary(matplotlibfilename_abs,
                                           isfile=False)
        util.create_directory_if_necessary(matplotlibfilename_abs +
                                           "/tex.cache",
                                           isfile=False)
        ipythondir_rel = os.path.join(run_dir_rel, ".ipython")
        ipythondir_abs = os.path.join(run_dir_abs, ".ipython")
        util.create_directory_if_necessary(ipythondir_abs, isfile=False)
        with open(batfilename_abs, "w") as batfile:
            batfile.write("set path={0};%path%\n".format(remotepath))
            batfile.write("set PYTHONPATH={0}\n".format(remotepythonpath))
            #batfile.write("set R_HOME={0}\n".format(os.path.join(remotepythoninstall,"R-2.15.2")))
            #batfile.write("set R_USER={0}\n".format(remotewd))
            batfile.write("set USERPROFILE={0}\n".format(run_dir_rel))
            batfile.write(
                "set MPLCONFIGDIR={0}\n".format(matplotlibfilename_rel))
            batfile.write("set IPYTHONDIR={0}\n".format(ipythondir_rel))
            batfile.write("python {0}\n".format(command_string))

        return batfilename_rel
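Note that result_remote.encode("string-escape") is a Python 2 idiom; the "string-escape" codec was removed, so under Python 3 this raises LookupError. If porting, a rough stand-in (assuming only the backslashes in the Windows path need escaping) would be:

# Python 3 stand-in for the Python 2 "string-escape" codec (assumption: only
# backslashes in the Windows path need doubling)
result_remote2 = result_remote.replace("\\", "\\\\")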
Example #33
    def CopySource(self,localpythonpath, run_dir_abs):
        
        if self.update_remote_python_parent:
            remote_python_parent = self.remote_python_parent
        else:
            remote_python_parent = run_dir_abs + os.path.sep + "pythonpath"
        util.create_directory_if_necessary(remote_python_parent, isfile=False)
        remotepythonpath_list = []
        for i, localpythonpathdir in enumerate(localpythonpath.split(';')):
            remotepythonpathdir = os.path.join(remote_python_parent, str(i))
            remotepythonpath_list.append(remotepythonpathdir)
            xd_string = HPC.FindDirectoriesToExclude(localpythonpathdir)
            robocopycommand = 'robocopy /s {0} {1}{2}'.format(localpythonpathdir,remotepythonpathdir,xd_string)
            logging.info(robocopycommand)
            os.system(robocopycommand)

        remotepythonpath = ";".join(remotepythonpath_list)
        return remotepythonpath
Example #35
    def create_run_dir(self):
        username = os.environ["USERNAME"]
        localwd = os.getcwd()
        if localwd.startswith("\\\\"):
            remotewd = self.fileshare + os.path.sep + username + os.path.sep + "\\".join(localwd.split('\\')[4:])
        else:
            remotewd = self.fileshare + os.path.sep + username + os.path.splitdrive(localwd)[1]  #using '+' because 'os.path.join' doesn't work with shares
        remotewd = remotewd.replace("\\","/")
        if remotewd.endswith("/"): # remove trailing /
            remotewd = remotewd[:-1]
        run_dir_rel = os.path.join("runs",util.datestamp(appendrandom=True))
        util.create_directory_if_necessary("runs",isfile=False)
        if not os.path.isfile(".ignoreTgzChange"):
            with open("runs" +  os.path.sep + ".ignoreTgzChange","w") as ignoreFile:
                ignoreFile.write("\n")


        run_dir_abs = "/user/{0}/{1}".format(username,run_dir_rel)
        #!! hadoop_create_directory_if_necessary(run_dir_abs,isfile=False)
        return remotewd, run_dir_abs, run_dir_rel
Example #37
def run_one_task(original_distributable, taskindex, taskcount, workdirectory):
    '''
    Does a fraction of the work (e.g. 1 of every 1000 work items) and then saves the results to a single file.
    If taskindex == taskcount, does the reduce step instead.
    '''

    if not 0 < taskcount: raise Exception("Expect taskcount to be positive")
    if not (0 <= taskindex and taskindex < taskcount+1): raise Exception("Expect taskindex to be between 0 and taskcount (both inclusive)")

    shaped_distributable = shape_to_desired_workcount(original_distributable, taskcount)

    if shaped_distributable.work_count != taskcount : raise Exception("Assert: expect workcount == taskcount")

    util.create_directory_if_necessary(workdirectory, isfile=False, robust=True)

    if (taskindex < taskcount):
        doMainWorkForOneIndex(shaped_distributable, taskcount, taskindex, workdirectory)
        return None
    else:
        result_sequence = work_sequence_from_disk(workdirectory, taskcount)
        return shaped_distributable.reduce(result_sequence)
Example #38
    def submit_to_cluster(self, batfilename_rel, distributable, remotewd, run_dir_abs, run_dir_rel):
        stdout_dir_rel = os.path.join(run_dir_rel,"stdout")
        stdout_dir_abs = os.path.join(run_dir_abs,"stdout")
        util.create_directory_if_necessary(stdout_dir_abs, isfile=False)
        stderr_dir_rel = os.path.join(run_dir_rel,"stderr")
        stderr_dir_abs = os.path.join(run_dir_abs,"stderr")
        util.create_directory_if_necessary(stderr_dir_abs, isfile=False)

        #create the Powershell file
        psfilename_rel = os.path.join(run_dir_rel,"dist.ps1")
        psfilename_abs = os.path.join(run_dir_abs,"dist.ps1")
        util.create_directory_if_necessary(psfilename_abs, isfile=True)
        with open(psfilename_abs, "w") as psfile:
            psfile.write(r"""Add-PsSnapin Microsoft.HPC
        Set-Content Env:CCP_SCHEDULER {0}
        $r = New-HpcJob -Name "{7}" -Priority {8}{12}
        $r.Id
        Add-HpcTask -Name Parametric -JobId $r.Id -Parametric -Start 0 -End {1} -CommandLine "{6} * {5}" -StdOut "{2}\*.txt" -StdErr "{3}\*.txt" -WorkDir {4}
        Add-HpcTask -Name Reduce -JobId $r.Id -Depend Parametric -CommandLine "{6} {5} {5}" -StdOut "{2}\reduce.txt" -StdErr "{3}\reduce.txt" -WorkDir {4}
        Submit-HpcJob -Id $r.Id
        $j = Get-HpcJob -Id $r.Id
        $i = $r.id
        $s = 10

        while(($j.State -ne "Finished") -and ($j.State -ne "Failed") -and ($j.State -ne "Canceled"))
        {10}
            $x = $j.State
            Write-Host "${10}x{11}. Job# ${10}i{11} sleeping for ${10}s{11}"
            Start-Sleep -s $s
            if ($s -ge 60)
            {10}
	        $s = 60
            {11}
            else
            {10}
                $s = $s * 1.1
            {11}
           $j.Refresh()
        {11}

        """                 .format(
                                self.clustername,   #0
                                self.taskcount-1,   #1
                                stdout_dir_rel,     #2
                                stderr_dir_rel,     #3
                                remotewd,           #4
                                self.taskcount,     #5
                                batfilename_rel,    #6
                                self.maxlen(str(distributable),50),      #7
                                self.priority,      #8
                                self.unit,          #9 - no longer used; #12 sets the unit instead
                                "{",                #10
                                "}",                #11
                                self.numString()    #12
                                ))

        import subprocess
        proc = subprocess.Popen(["powershell.exe", "-ExecutionPolicy", "Unrestricted", psfilename_abs], cwd=os.getcwd())
        if proc.wait() != 0: raise Exception("The PowerShell cluster-submit script returned a non-zero exit code")
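The {10} and {11} slots in the PowerShell template above are bound to the literal strings "{" and "}"; that is how the script blocks survive Python's .format, which would otherwise treat raw braces as placeholders. The same trick in miniature:

template = "while($x) {0} $x-- {1}"
print(template.format("{", "}"))  # while($x) { $x-- }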
Example #39
    def create_bat_file(self, distributable, remotepythoninstall, remotepythonpath, remotewd, run_dir_abs, run_dir_rel, result_remote):
        path_share_list = [r"",r"Scripts"]
        remotepath_list = []
        for path_share in path_share_list:
            path_share_abs = os.path.join(remotepythoninstall,path_share)
            if not os.path.isdir(path_share_abs): raise Exception("Expect path directory at '{0}'".format(path_share_abs))
            remotepath_list.append(path_share_abs)
        remotepath = ";".join(remotepath_list)

        distributablep_filename_rel = self.create_distributablep(distributable, run_dir_abs, run_dir_rel)

        distributable_py_file = os.path.join(os.path.dirname(__file__),"..","distributable.py")
        if not os.path.exists(distributable_py_file): raise Exception("Expect file at " + distributable_py_file + ", but it doesn't exist.")
        localfilepath, file = os.path.split(distributable_py_file)
        remoteexepath = os.path.join(remotepythonpath.split(';')[0],"fastlmm","util") #!!shouldn't need to assume where the file is in source
        #run_dir_rel + os.path.sep + "pythonpath" + os.path.sep + os.path.splitdrive(localfilepath)[1]

        result_remote2 = result_remote.encode("string-escape")
        command_string = remoteexepath + os.path.sep + file + " " + distributablep_filename_rel + r""" "LocalInParts(%1,{0},mkl_num_threads={1},result_file=""{2}"") " """.format(self.taskcount,self.mkl_num_threads,result_remote2)
        batfilename_rel = os.path.join(run_dir_rel,"dist.bat")
        batfilename_abs = os.path.join(run_dir_abs,"dist.bat")
        util.create_directory_if_necessary(batfilename_abs, isfile=True)
        matplotlibfilename_rel = os.path.join(run_dir_rel,".matplotlib")
        matplotlibfilename_abs = os.path.join(run_dir_abs,".matplotlib")
        util.create_directory_if_necessary(matplotlibfilename_abs, isfile=False)
        util.create_directory_if_necessary(matplotlibfilename_abs + "/tex.cache", isfile=False)
        ipythondir_rel = os.path.join(run_dir_rel,".ipython")
        ipythondir_abs = os.path.join(run_dir_abs,".ipython")
        util.create_directory_if_necessary(ipythondir_abs, isfile=False)
        with open(batfilename_abs, "w") as batfile:
            batfile.write("set path={0};%path%\n".format(remotepath))
            batfile.write("set PYTHONPATH={0}\n".format(remotepythonpath))
            #batfile.write("set R_HOME={0}\n".format(os.path.join(remotepythoninstall,"R-2.15.2")))
            #batfile.write("set R_USER={0}\n".format(remotewd))
            batfile.write("set USERPROFILE={0}\n".format(run_dir_rel))
            batfile.write("set MPLCONFIGDIR={0}\n".format(matplotlibfilename_rel))
            batfile.write("set IPYTHONDIR={0}\n".format(ipythondir_rel))            
            batfile.write("python {0}\n".format(command_string))

        return batfilename_rel
Example #40
    def setUpClass(self):

        from fastlmm.util.util import create_directory_if_necessary
        create_directory_if_necessary(self.tempout_dir, isfile=False)
        self.currentFolder = os.path.dirname(os.path.realpath(__file__))
Example #41
    def submit_to_cluster(self, batfilename_rel_list, fileInWorkingDirectoryList, tgzList, tgzListPythonPath, tgzListPythonSettings, distributable, remotewd, run_dir_abs, run_dir_rel):
        logging.info('Hadoop runner is submitting to cluster')

        #!! e.g. hdfs://rr1-n13-02-c02/user/carlk/inputs.tgz#inputs,hdfs://rr1-n13-02-c02/user/carlk/datasets.tgz#datasets,hdfs://rr1-n13-02-c02/user/carlk/src.tgz#src
        #!! could do this functionally
        archivesStringList = []
        for tgz in tgzList:
            archiveString = "hdfs:{0}#{1}".format(tgz[1],os.path.splitext(tgz[0])[0])
            archivesStringList.append(archiveString)
        archivesStringList.append(tgzListPythonSettings)
        for tgz in tgzListPythonPath:
            archivesStringList.append(tgz)

        #e.g. distMapper.bat,distReducer.bat
        filesString = ",".join(batfilename_rel_list+fileInWorkingDirectoryList)

        taskIndexDir = run_dir_rel + os.path.sep + "input"
        util.create_directory_if_necessary(taskIndexDir,isfile=False)

        #zgoal = int(SP.ceil(SP.log(self.taskcount)/SP.log(10)))
        with open(taskIndexDir +  os.path.sep + "taskIndexList.txt","w") as taskIndexListFile:
            for taskIndex in range(self.taskcount):
                taskIndexListFile.write("{0}\n".format(taskIndex)) # str(taskIndex).zfill(zgoal)))

        #hadoop fs -rmr runs/2013-08-02_13_51_42
        #hadoop fs -copyFromLocal runs\2013-08-02_13_51_42 runs/2013-08-02_13_51_42
        #hadoop jar %HADOOP_HOME%\lib\hadoop-streaming.jar ^
        #        -archives "hdfs:/user/carlk/source/carlkextranet05312013/ERG01/src/tests/datasets.2013-07-31_11_12_11.tgz#datasets,hdfs:/user/carlk/runs/pythonpath.0.src.2013-07-31_14_30_56/src.tgz#pythonpath.0.src" ^
        #        -files "hdfs:/user/carlk/runs/2013-08-02_13_51_42/distMapper.bat,hdfs:/user/carlk/runs/2013-08-02_13_51_42/distReducer.bat,hdfs:/user/carlk/runs/2013-08-02_13_51_42/distributable.p" ^
        #        -input "runs/2013-08-02_13_51_42/input" ^
        #        -output "runs/2013-08-02_13_51_42/output" ^
        #        -mapper "distMapper.bat" ^
        #       -reducer "distReducer.bat"
        #hadoop fs -cat runs/2013-08-02_13_51_42/output/part-00000  | more
        s00 = r"%HADOOP_HOME%\bin\hadoop fs -rmr -skipTrash {0}".format(run_dir_rel.replace("\\","/"))
        s0 = r"%HADOOP_HOME%\bin\hadoop fs -copyFromLocal {0} {1}".format(run_dir_rel, run_dir_rel.replace("\\","/"))


        #-D mapreduce.reduce.shuffle.connect.timeout=3600000 ^
        #-D io.sort.mb=1400 ^
        #-D job.end.retry.interval=3600000 ^
        #-D mapred.tasktracker.expiry.interval=3600000 ^

        logging.info("running {0}".format(str(distributable)))
        

        s = r"""%HADOOP_HOME%\bin\hadoop jar %HADOOP_HOME%\lib\hadoop-streaming.jar ^
        -archives "{0}" ^
        -files "{1}" ^
        -D mapred.job.name="{8}" ^
        -D mapred.map.tasks={4} ^
        -D mapred.reduce.tasks=1 ^
        -D mapred.job.map.memory.mb={5} ^
        -D mapred.job.reduce.memory.mb={6} ^
        -D mapred.task.timeout={7} ^
        -D mapred.job.queue.name="{9}" ^
        -input {2} ^
        -output {3} ^
        -inputformat org.apache.hadoop.mapred.lib.NLineInputFormat ^
        -mapper "distMapper.bat" ^
        -reducer "distReducer.bat"
            """.format(
                    ",".join(archivesStringList),       #0
                    filesString,                        #1
                    taskIndexDir.replace("\\","/"),     #2
                    (run_dir_rel + os.path.sep + "output").replace("\\","/"), #3
                    self.taskcount,                     #4
                    self.mapmemory,                     #5
                    self.reducememory,                  #6
                    0,                                  #7
                    str(distributable),                 #8
                    self.queue                         #9
                    )
        runHadoopFileName = run_dir_rel + os.path.sep + "runHadoop.bat"
        logging.info("Hadoop runner is creating '{0}'".format(runHadoopFileName))
        with open(runHadoopFileName, "w") as runHadoopFile:
            runHadoopFile.write("call {0}\n".format(s00))
            runHadoopFile.write("call {0}\n".format(s0))
            runHadoopFile.write("call {0}\n".format(s))

        sOneLine = "".join(s.split("^\n"))

        
        logging.info("Hadoop runner running the copyFromLocal")
        with TemporaryFile() as output:
            stdout0 = subprocess.check_output(s0,stderr=output,shell=True)
            output.seek(0)
            stderr0 = output.read()
        logging.info("Result from 'Hadoop runner running the copyFromLocal' is stdout='{0}', stderr='{1}'".format(stdout0, stderr0))
        if stderr0: raise Exception("Stderr from command: '{0}'".format(stderr0))
        logging.info("Hadoop runner running the streamingjar")

        with TemporaryFile() as output:
            stdout = subprocess.check_output(sOneLine,stderr=output,shell=True)
            output.seek(0)
            stderr = output.read()
        logging.info("Result from 'Hadoop runner running the streamingjar' is stdout='{0}', stderr='{1}'".format(stdout, stderr))
        logging.info('Done: Hadoop runner is submitting to cluster')
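The input directory holds a single taskIndexList.txt with one task index per line; combined with NLineInputFormat, each streaming mapper receives exactly one line and thus learns which slice of the work it owns. A hypothetical mapper-side read (the real distMapper.bat runs Python with LocalMapper instead):

import sys
# Hypothetical: with NLineInputFormat, streaming mappers typically see "offset\ttaskindex".
line = sys.stdin.readline()
taskindex = int(line.strip().split("\t")[-1])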
Example #42
    def create_bat_file(self, distributable, remotepythoninstall, remotepythonpath, remotewd, run_dir_abs, run_dir_rel, result_remote, nodelocalwd, create_bat_file):
        path_share_list = [r"",r"Scripts"]
        remotepath_list = []
        for path_share in path_share_list:
            path_share_abs = os.path.join(remotepythoninstall,path_share)
            if not os.path.isdir(path_share_abs): raise Exception("Expect path directory at '{0}'".format(path_share_abs))
            remotepath_list.append(path_share_abs)
        remotepath = ";".join(remotepath_list)

        distributablep_filename_rel, distributablep_filename_abs = self.create_distributablep(distributable, run_dir_abs, run_dir_rel)

        distributable_py_file = os.path.join(os.path.dirname(__file__),"..","distributable.py")
        if not os.path.exists(distributable_py_file): raise Exception("Expect file at " + distributable_py_file + ", but it doesn't exist.")
        localfilepath, file = os.path.split(distributable_py_file)

        for remote_path_part in remotepythonpath.split(';'):
            remoteexe = os.path.join(remote_path_part,"fastlmm","util",file)
            if os.path.exists(remoteexe):
                break #not continue
            remoteexe = None
        assert remoteexe is not None, "Could not find '{0}' on remote python path. Is fastlmm on your local python path?".format(file)

        #run_dir_rel + os.path.sep + "pythonpath" + os.path.sep + os.path.splitdrive(localfilepath)[1]

        #result_remote2 = result_remote.encode("string-escape")
        command_string = remoteexe + r""" "{0}" """.format(distributablep_filename_abs) + r""" "LocalInParts(%1,{0},mkl_num_threads={1},result_file=""{2}"",run_dir=""{3}"") " """.format(
            self.taskcount,
            self.mkl_num_threads,
            "result.p",
            run_dir_abs.encode("string-escape"))
        batfilename_rel = os.path.join(run_dir_rel,"dist.bat")
        batfilename_abs = os.path.join(run_dir_abs,"dist.bat")
        util.create_directory_if_necessary(batfilename_abs, isfile=True)
        matplotlibfilename_rel = os.path.join(run_dir_rel,".matplotlib")
        matplotlibfilename_abs = os.path.join(run_dir_abs,".matplotlib")
        util.create_directory_if_necessary(matplotlibfilename_abs, isfile=False)
        util.create_directory_if_necessary(matplotlibfilename_abs + "/tex.cache", isfile=False)
        ipythondir_rel = os.path.join(run_dir_rel,".ipython")
        ipythondir_abs = os.path.join(run_dir_abs,".ipython")
        util.create_directory_if_necessary(ipythondir_abs, isfile=False)
        with open(batfilename_abs, "w") as batfile:
            batfile.write("set path={0};%path%\n".format(remotepath))
            batfile.write("set PYTHONPATH={0}\n".format(remotepythonpath))
            batfile.write("set USERPROFILE={0}\n".format(run_dir_abs))
            batfile.write("set MPLCONFIGDIR={0}\n".format(matplotlibfilename_abs))
            batfile.write("set IPYTHONDIR={0}\n".format(ipythondir_abs))
            batfile.write("python {0}\n".format(command_string))

        if (self.node_local):
            with open( os.path.join(run_dir_abs,"nodeprep.bat"), "w") as prepfile:
                prepfile.write(r"""set f="{0}"{1}""".format(remotewd,'\n'))
                prepfile.write(r"""set t="{0}"{1}""".format(nodelocalwd,'\n'))
                prepfile.write("if not exist %t% mkdir %t%\n")
                with open( os.path.join(run_dir_abs,"noderelease.bat"), "w") as releasefile:
                    releasefile.write(r"""set f="{0}"{1}""".format(remotewd,'\n'))
                    releasefile.write(r"""set t="{0}"{1}""".format(nodelocalwd,'\n'))
                    inputOutputCopier = HPCCopierNodeLocal(prepfile,releasefile,self.clean_up) #Create the object that copies input and output files to where they are needed
                    inputOutputCopier.input(distributable) # copy of the input files to where they are needed (i.e. to the cluster)
                    inputOutputCopier.output(distributable) # copy of the output files to where they are needed (i.e. off the cluster)
                    releasefile.write("rmdir /s %t%\n")
                    releasefile.write("exit /b 0\n")


        return batfilename_rel
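The quoting in command_string above is subtle: the doubled quotes ("") survive one level of .bat-file quoting so that the LocalInParts(...) expression reaches Python as a single argument. A minimal sketch of the expansion with placeholder values (the share paths are made up):

# hypothetical values, for illustration only
template = (r'"{0}" "LocalInParts(%1,{1},mkl_num_threads={2},'
            r'result_file=""{3}"",run_dir=""{4}"")"')
print(template.format(r"\\share\run\distributable.p", 4, 1, "result.p", r"\\\\share\\run"))
# -> "\\share\run\distributable.p" "LocalInParts(%1,4,mkl_num_threads=1,result_file=""result.p"",run_dir=""\\\\share\\run"")"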
Example #43
0
    def submit_to_cluster(self, batfilename_rel, distributable, remotewd, run_dir_abs, run_dir_rel, nodelocalwd):
        stdout_dir_rel = os.path.join(run_dir_rel,"stdout")
        stdout_dir_abs = os.path.join(run_dir_abs,"stdout")
        util.create_directory_if_necessary(stdout_dir_abs, isfile=False)
        stderr_dir_rel = os.path.join(run_dir_rel,"stderr")
        stderr_dir_abs = os.path.join(run_dir_abs,"stderr")
        util.create_directory_if_necessary(stderr_dir_abs, isfile=False)
        
        if len(self.excluded_nodes) > 0:
            excluded_nodes = "Set-HpcJob -Id $r.Id -addExcludedNodes {0}".format(", ".join(self.excluded_nodes))
        else:
            excluded_nodes = ""


        #create the Powershell file
        psfilename_rel = os.path.join(run_dir_rel,"dist.ps1")
        psfilename_abs = os.path.join(run_dir_abs,"dist.ps1")
        util.create_directory_if_necessary(psfilename_abs, isfile=True)
        with open(psfilename_abs, "w") as psfile:
            psfile.write(r"""Add-PsSnapin Microsoft.HPC
        Set-Content Env:CCP_SCHEDULER {0}
        $r = New-HpcJob -Name "{7}" -Priority {8}{12}{14}{16} -RunTime {15} -FailOnTaskFailure {23} #-Preemptable {22}
        $r.Id
        if ({20})
        {10}
            $from = "{4}"
            $to = "{17}"
            Add-HpcTask -Name NodePrep    -JobId $r.Id -Type NodePrep                -CommandLine "${{from}}\{18}"        -StdOut "${{from}}\{2}\nodeprep.txt"    -StdErr "${{from}}\{3}\nodeprep.txt"    -WorkDir .
            Add-HpcTask -Name Parametric  -JobId $r.Id -Parametric -Start 0 -End {1} -CommandLine "${{from}}\{6} * {5}"   -StdOut "${{from}}\{2}\*.txt"    -StdErr "${{from}}\{3}\*.txt"                  -WorkDir $to
            Add-HpcTask -Name Reduce      -JobId $r.Id -Depend Parametric            -CommandLine "${{from}}\{6} {5} {5}" -StdOut "${{from}}\{2}\reduce.txt"      -StdErr "${{from}}\{3}\reduce.txt"      -WorkDir $to
            {21}Add-HpcTask -Name NodeRelease -JobId $r.Id -Type NodeRelease         -CommandLine "${{from}}\{19}"        -StdOut "${{from}}\{2}\noderelease.txt" -StdErr "${{from}}\{3}\noderelease.txt" -WorkDir .
        {11}
        else
        {10}
            Add-HpcTask -Name Parametric -JobId $r.Id -Parametric -Start 0 -End {1} -CommandLine "{6} * {5}" -StdOut "{2}\*.txt" -StdErr "{3}\*.txt" -WorkDir {4}
            Add-HpcTask -Name Reduce -JobId $r.Id -Depend Parametric -CommandLine "{6} {5} {5}" -StdOut "{2}\reduce.txt" -StdErr "{3}\reduce.txt" -WorkDir {4}
        {11}

        {13}
        Submit-HpcJob -Id $r.Id
        $j = Get-HpcJob -Id $r.Id
        $i = $r.id
        $s = 10

        while(($j.State -ne "Finished") -and ($j.State -ne "Failed") -and ($j.State -ne "Canceled"))
        {10}
            $x = $j.State
            Write-Host "${10}x{11}. Job# ${10}i{11} sleeping for ${10}s{11}"
            Start-Sleep -s $s
            if ($s -ge 60)
            {10}
            $s = 60
            {11}
            else
            {10}
                $s = $s * 1.1
            {11}
           $j.Refresh()
        {11}

        """                 .format(
                                self.clustername,   #0
                                self.taskcount-1,   #1
                                stdout_dir_rel,     #2
                                stderr_dir_rel,     #3
                                remotewd,           #4 fileshare wd
                                self.taskcount,     #5
                                batfilename_rel,    #6
                                self.maxlen(str(distributable),50),      #7
                                self.priority,      #8
                                self.unit,          #9 -- no longer used; #12 sets the unit instead
                                "{",                #10
                                "}",                #11
                                self.numString(),   #12
                                excluded_nodes,     #13
                                ' -templateName "{0}"'.format(self.template) if self.template is not None else "", #14
                                self.runtime,       #15 RuntimeSeconds
                                ' -NodeGroups "{0}"'.format(self.nodegroups) if self.nodegroups is not None else "", #16
                                nodelocalwd,        #17 the node-local wd
                                batfilename_rel[0:-8]+"nodeprep.bat", #18
                                batfilename_rel[0:-8]+"noderelease.bat", #19
                                1 if self.node_local else 0,             #20
                                "",                                      #21 always run release task
                                self.preemptable,                        #22
                                '$true' if self.FailOnTaskFailure else '$false',   #23
                                ))
        assert batfilename_rel[-8:] == "dist.bat", "real assert"
        import subprocess
        proc = subprocess.Popen(["powershell.exe", "-ExecutionPolicy", "Unrestricted", psfilename_abs], cwd=os.getcwd())
        if proc.wait() != 0: raise Exception("Running the PowerShell cluster submit script returned a non-zero exit code")
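The generated PowerShell polls the job with a sleep that grows by 10% per iteration and is capped at 60 seconds. A rough Python rendering of that loop (the job object and its attributes are hypothetical stand-ins for the HPC API):

import time

def wait_for_job(job, sleep_s=10.0, cap_s=60.0):
    # Poll until the job reaches a terminal state, backing off gently.
    while job.state not in ("Finished", "Failed", "Canceled"):
        print("{0}. Job# {1} sleeping for {2:.0f}".format(job.state, job.id, sleep_s))
        time.sleep(sleep_s)
        sleep_s = cap_s if sleep_s >= cap_s else sleep_s * 1.1
        job.refresh()
    return job.state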
Example #44
0
    def reduce(self, result_sequence):
        """

        TODO: finish docstring
        return
        
        """
        #self.feature_selection_strategy.run_once() #Don't need and save time

        ##########################################
        ## perform model selection
        ##########################################

        mse_cv = []  #np.zeros((self.feature_selection_strategy.num_folds, len(self.k_values), len(self.delta_values)))
        ll_cv = []  # np.zeros((self.feature_selection_strategy.num_folds, len(self.k_values), len(self.delta_values)))
        best_delta_for_k = []  # np.zeros((self.feature_selection_strategy.num_folds, len(self.k_values)))
        lingreg_results = None
        for result in result_sequence:
            if len(result) == 2:  # the lingreg_results look different than the regular results because they are length two
                if lingreg_results is not None: raise Exception("expected at most one lingreg result")
                lingreg_results = result
                continue
            fold_idx, mse_cv1, ll_cv1, best_delta_for_k_1 = result
            mse_cv.append(mse_cv1)
            ll_cv.append(ll_cv1)
            best_delta_for_k.append(best_delta_for_k_1)
        if lingreg_results is None: raise Exception("no lingreg result was received")
        if (self.strategy == "insample_cv") or (self.strategy == "lmm_full_cv"):
            if len(ll_cv) != len(mse_cv) or len(mse_cv) != self.feature_selection_strategy.num_folds:
                raise Exception("These should be the same. len(ll_cv)={0}, len(mse_cv)={1}, self.feature_selection_strategy.num_folds={2}".format(len(ll_cv), len(mse_cv), self.feature_selection_strategy.num_folds))
        else:
            assert len(best_delta_for_k) == len(mse_cv) == self.feature_selection_strategy.num_folds

        # find best parameters
        if mse_cv is not None:
            best_k_mse, best_delta_mse, best_mse, best_delta_mse_interp, best_mse_interp = self.feature_selection_strategy.reduce_result(
                mse_cv,
                self.k_values,
                self.delta_values,
                self.strategy,
                self.output_prefix,
                best_delta_for_k,
                label="mse",
                create_pdf=self.create_pdf)
        if ll_cv is not None:
            best_k_ll, best_delta_ll, best_ll, best_delta_ll_interp, best_ll_interp = self.feature_selection_strategy.reduce_result(
                ll_cv,
                self.k_values,
                self.delta_values,
                self.strategy,
                self.output_prefix,
                best_delta_for_k,
                label="ll",
                create_pdf=self.create_pdf)

        if self.select_by_ll:
            best_k, best_delta, best_obj, best_delta_interp, best_obj_interp = best_k_ll, best_delta_ll, best_ll, best_delta_ll_interp, best_ll_interp
        else:
            best_k, best_delta, best_obj, best_delta_interp, best_obj_interp = best_k_mse, best_delta_mse, best_mse, best_delta_mse_interp, best_mse_interp

        # perform final scan on whole data set
        best_snps, sorted_pval = self.feature_selection_strategy.final_scan(
            best_k, lingreg_results)

        # write report file
        if self.output_prefix is not None:

            report = "k_grid: " + str([k for k in self.k_values]) + "\n"
            ln_delta_grid = np.array([sp.log(x) for x in self.delta_values])
            report += "ln_delta_grid: " + str(ln_delta_grid.tolist()) + "\n"
            report += "best k=%i\nbest ln_delta=%.1e\nbest objective=%.2f" % (
                best_k, sp.log(best_delta), best_obj)
            if self.feature_selection_strategy.interpolate_delta and best_delta_interp is not None:
                report += "\nbest ln_delta_interp=%.1e\nbest objective_interp=%.2f" % (
                    sp.log(best_delta_interp), best_obj_interp)

            report_fn = self.output_prefix + "_report.txt"
            util.create_directory_if_necessary(report_fn)
            with open(report_fn, "w") as report_file:
                report_file.write(report)

            # write out SNPs to keep
            #df = pd.DataFrame({"snp_id": best_snps.index, "snp_rs": best_snps.values, "p_values": sorted_pval})
            df = pd.DataFrame({"snp_rs": best_snps.values})  #Change snp_rs to sid?
            df.to_csv(self.output_prefix + "_snp.csv", index=False, header=False)

        return best_k, best_delta, best_obj, best_snps
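For intuition, the reduce above expects one length-2 lingreg result plus one (fold_idx, mse, ll, best_delta_for_k) tuple per fold. A toy sequence with made-up shapes (2 k values, 3 delta values, 5 folds); the combiner object is an assumed stand-in:

import numpy as np

lingreg = (np.zeros(3), np.zeros(3))  # any length-2 tuple is treated as the lingreg result
fold_results = [(fold, np.ones((2, 3)), np.ones((2, 3)), np.ones(2)) for fold in range(5)]
result_sequence = iter([lingreg] + fold_results)
# best_k, best_delta, best_obj, best_snps = combiner.reduce(result_sequence)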
Example #45
0
    def reduce(self, result_sequence):
        """

        TODO: finish docstring
        return
        
        """
        #self.feature_selection_strategy.run_once() #Don't need and save time

        ##########################################
        ## perform model selection
        ##########################################

        mse_cv = [] #np.zeros((self.feature_selection_strategy.num_folds, len(self.k_values), len(self.delta_values)))
        ll_cv = [] # np.zeros((self.feature_selection_strategy.num_folds, len(self.k_values), len(self.delta_values)))
        best_delta_for_k = [] # np.zeros((self.feature_selection_strategy.num_folds, len(self.k_values)))
        lingreg_results = None
        for result in result_sequence:
            if len(result) == 2: # the lingreg_results look different than the regular results because they are length two
                if lingreg_results is not None: raise Exception("expected at most one lingreg result")
                lingreg_results = result
                continue
            fold_idx, mse_cv1, ll_cv1, best_delta_for_k_1 = result
            mse_cv.append(mse_cv1)
            ll_cv.append(ll_cv1)
            best_delta_for_k.append(best_delta_for_k_1)
        if lingreg_results is None: raise Exception("no lingreg result was received")
        if (self.strategy == "insample_cv") or (self.strategy == "lmm_full_cv"):
            if len(ll_cv) != len(mse_cv) or len(mse_cv) != self.feature_selection_strategy.num_folds:
                raise Exception("These should be the same. len(ll_cv)={0}, len(mse_cv)={1}, self.feature_selection_strategy.num_folds={2}".format(len(ll_cv), len(mse_cv), self.feature_selection_strategy.num_folds))
        else:
            assert len(best_delta_for_k) == len(mse_cv) == self.feature_selection_strategy.num_folds


        # find best parameters
        if mse_cv is not None:
            best_k_mse, best_delta_mse, best_mse, best_delta_mse_interp, best_mse_interp = self.feature_selection_strategy.reduce_result(mse_cv, self.k_values, self.delta_values, self.strategy, self.output_prefix, best_delta_for_k, label="mse")
        if ll_cv is not None:
            best_k_ll, best_delta_ll, best_ll, best_delta_ll_interp, best_ll_interp = self.feature_selection_strategy.reduce_result(ll_cv, self.k_values, self.delta_values, self.strategy, self.output_prefix, best_delta_for_k, label="ll")

        if self.select_by_ll:
            best_k, best_delta, best_obj, best_delta_interp, best_obj_interp = best_k_ll, best_delta_ll, best_ll, best_delta_ll_interp, best_ll_interp
        else:
            best_k, best_delta, best_obj, best_delta_interp, best_obj_interp = best_k_mse, best_delta_mse, best_mse, best_delta_mse_interp, best_mse_interp

        
        # perform final scan on whole data set
        best_snps, sorted_pval = self.feature_selection_strategy.final_scan(best_k, lingreg_results)

        # write report file
        if self.output_prefix is not None:

            report = "k_grid: " + str([k for k in self.k_values]) + "\n"
            ln_delta_grid = np.array([sp.log(x) for x in self.delta_values])
            report += "ln_delta_grid: " + str(ln_delta_grid.tolist()) + "\n"
            report += "best k=%i\nbest ln_delta=%.1e\nbest objective=%.2f" % (best_k, sp.log(best_delta), best_obj)
            if self.feature_selection_strategy.interpolate_delta and best_delta_interp is not None:
                report += "\nbest ln_delta_interp=%.1e\nbest objective_interp=%.2f" % (sp.log(best_delta_interp), best_obj_interp)
            
            report_fn = self.output_prefix + "_report.txt"
            util.create_directory_if_necessary(report_fn)
            with open(report_fn, "w") as report_file:
                report_file.write(report)
            
            # write out SNPs to keep
            #df = pd.DataFrame({"snp_id": best_snps.index, "snp_rs": best_snps.values, "p_values": sorted_pval})
            df = pd.DataFrame({ "snp_rs": best_snps.values}) #Change snp_rs to sid?
            df.to_csv(self.output_prefix + "_snp.csv", index=False, header=False)

        return best_k, best_delta, best_obj, best_snps
Example #46
0
    def create_bat_file(self, distributable, remotepythoninstall, remotepythonpath, remotewd, run_dir_abs, run_dir_rel, result_remote, nodelocalwd, distributable_again): # renamed from 'create_bat_file', which shadowed the method name; the argument is unused below
        path_share_list = [r"",r"Scripts"]
        remotepath_list = []
        for path_share in path_share_list:
            path_share_abs = os.path.join(remotepythoninstall,path_share)
            if not os.path.isdir(path_share_abs): raise Exception("Expect path directory at '{0}'".format(path_share_abs))
            remotepath_list.append(path_share_abs)
        remotepath = ";".join(remotepath_list)

        distributablep_filename_rel, distributablep_filename_abs = self.create_distributablep(distributable, run_dir_abs, run_dir_rel)

        distributable_py_file = os.path.join(os.path.dirname(__file__),"..","distributable.py")
        if not os.path.exists(distributable_py_file): raise Exception("Expect file at " + distributable_py_file + ", but it doesn't exist.")
        localfilepath, filename = os.path.split(distributable_py_file)

        remoteexe = None
        for remote_path_part in remotepythonpath.split(';'):
            candidate = os.path.join(remote_path_part,"fastlmm","util",filename)
            if os.path.exists(candidate):
                remoteexe = candidate
                break # found it; stop searching
        assert remoteexe is not None, "Could not find '{0}' on remote python path. Is fastlmm on your local python path?".format(filename)

        #run_dir_rel + os.path.sep + "pythonpath" + os.path.sep + os.path.splitdrive(localfilepath)[1]

        #result_remote2 = result_remote.encode("string-escape")
        command_string = remoteexe + r""" "{0}" """.format(distributablep_filename_abs) + r""" "LocalInParts(%1,{0},mkl_num_threads={1},result_file=""{2}"",run_dir=""{3}"") " """.format(
            self.taskcount,
            self.mkl_num_threads,
            "result.p",
            run_dir_abs.replace("\\", "\\\\")) # escape backslashes; the Python 2 "string-escape" codec is unavailable for str in Python 3
        batfilename_rel = os.path.join(run_dir_rel,"dist.bat")
        batfilename_abs = os.path.join(run_dir_abs,"dist.bat")
        util.create_directory_if_necessary(batfilename_abs, isfile=True)
        matplotlibfilename_rel = os.path.join(run_dir_rel,".matplotlib")
        matplotlibfilename_abs = os.path.join(run_dir_abs,".matplotlib")
        util.create_directory_if_necessary(matplotlibfilename_abs, isfile=False)
        util.create_directory_if_necessary(os.path.join(matplotlibfilename_abs, "tex.cache"), isfile=False)
        ipythondir_rel = os.path.join(run_dir_rel,".ipython")
        ipythondir_abs = os.path.join(run_dir_abs,".ipython")
        util.create_directory_if_necessary(ipythondir_abs, isfile=False)
        with open(batfilename_abs, "w") as batfile:
            batfile.write("set path={0};%path%\n".format(remotepath))
            batfile.write("set PYTHONPATH={0}\n".format(remotepythonpath))
            batfile.write("set USERPROFILE={0}\n".format(run_dir_abs))
            batfile.write("set MPLCONFIGDIR={0}\n".format(matplotlibfilename_abs))
            batfile.write("set IPYTHONDIR={0}\n".format(ipythondir_abs))
            batfile.write("python {0}\n".format(command_string))

        if self.node_local:
            with open( os.path.join(run_dir_abs,"nodeprep.bat"), "w") as prepfile:
                prepfile.write(r"""set f="{0}"{1}""".format(remotewd,'\n'))
                prepfile.write(r"""set t="{0}"{1}""".format(nodelocalwd,'\n'))
                prepfile.write("if not exist %t% mkdir %t%\n")
                with open( os.path.join(run_dir_abs,"noderelease.bat"), "w") as releasefile:
                    releasefile.write(r"""set f="{0}"{1}""".format(remotewd,'\n'))
                    releasefile.write(r"""set t="{0}"{1}""".format(nodelocalwd,'\n'))
                    inputOutputCopier = HPCCopierNodeLocal(prepfile,releasefile,self.clean_up) #Create the object that copies input and output files to where they are needed
                    inputOutputCopier.input(distributable) # copy of the input files to where they are needed (i.e. to the cluster)
                    inputOutputCopier.output(distributable) # copy of the output files to where they are needed (i.e. off the cluster)
                    releasefile.write("rmdir /s %t%\n")
                    releasefile.write("exit /b 0\n")


        return batfilename_rel
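The nodeprep/noderelease pair above implements a plain stage-in/stage-out pattern: create the node-local directory, copy inputs in, run, copy outputs back, delete the directory. A hedged Python equivalent (paths and flat file lists are placeholders):

import os, shutil, subprocess

def run_node_local(remotewd, nodelocalwd, inputs, outputs, cmd):
    os.makedirs(nodelocalwd, exist_ok=True)      # nodeprep: if not exist %t% mkdir %t%
    for rel in inputs:                           # stage inputs onto the node
        shutil.copy2(os.path.join(remotewd, rel), os.path.join(nodelocalwd, rel))
    subprocess.check_call(cmd, cwd=nodelocalwd)  # run the task in the node-local wd
    for rel in outputs:                          # stage outputs back to the share
        shutil.copy2(os.path.join(nodelocalwd, rel), os.path.join(remotewd, rel))
    shutil.rmtree(nodelocalwd)                   # noderelease: rmdir /s %t%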
Example #47
0
    def setUpClass(self):

        from fastlmm.util.util import create_directory_if_necessary
        create_directory_if_necessary(self.tempout_dir, isfile=False)
        self.currentFolder = os.path.dirname(os.path.realpath(__file__))
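create_directory_if_necessary itself is not shown in these examples; judging from how it is called, a minimal sketch of the assumed behavior is that the isfile flag decides whether the argument names a file, in which case only its parent directory is created:

import os

def create_directory_if_necessary(name, isfile=True):
    directory = os.path.dirname(name) if isfile else name
    if directory:  # dirname may be empty for a bare filename
        os.makedirs(directory, exist_ok=True)  # tolerate concurrent creation by other cluster tasks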
Example #48
0
    def reduce_result(self, loss_cv, k_values, delta_values, strategy, output_prefix, best_delta_for_k, label="mse", create_pdf=True):
        """
        turn cross-validation results into results
        """
        #self.run_once() #Don't need and saves time

        # average over splits
        average_loss = np.array(loss_cv).mean(axis=0)
        #average_loss = np.vstack(loss_cv).mean(axis=0)
        best_ln_delta_interp, best_obj_interp, best_delta_interp = (None, None, None)

        # reconstruct results
        if strategy == "lmm_full_cv":
            # save cv scores
            if output_prefix is not None:
                split_idx = ["mean"] * len(k_values)
                for idx in range(len(loss_cv)):
                    split_idx.extend([idx] * loss_cv[idx].shape[0])

                stacked_result = np.vstack(loss_cv)
                stacked_result = np.vstack((average_loss, stacked_result))

                out_fn = output_prefix + "_" + label + ".csv"
                cols = pd.MultiIndex.from_arrays([split_idx, k_values * (self.num_folds + 1)], names=['split_id', 'k_value'])
                df = pd.DataFrame(stacked_result, columns=delta_values, index=cols)
                util.create_directory_if_necessary(out_fn)
                df.to_csv(out_fn)

            # make sure delta is not at the boundary for any k
            assert average_loss.shape[0] == len(k_values)
            for k_idx in range(average_loss.shape[0]):
                tmp_idx = np.argmin(average_loss[k_idx])

                if tmp_idx == 0 or tmp_idx == len(delta_values) - 1:
                    logging.warning("(select by %s): ln_delta for k=%i is at the boundary (idx=%i) of defined delta grid" % (label, k_values[k_idx], tmp_idx))

            best_k_idx, best_delta_idx = np.unravel_index(average_loss.argmin(), average_loss.shape)
            best_k, best_delta = k_values[best_k_idx], delta_values[best_delta_idx]
            best_obj = average_loss[best_k_idx, best_delta_idx]
            best_ln_delta = np.log(best_delta)
            best_str = "best: k=%i, ln_d=%.1f, obj=%.2f" % (best_k, best_ln_delta, best_obj)

            # fit parabola to 3 points in logspace
            if self.interpolate_delta:
                if best_delta_idx != 0 and best_delta_idx != len(delta_values) - 1:
                    log_deltas = [np.log(d) for d in delta_values[best_delta_idx - 1:best_delta_idx + 2]]
                    error_3pt = average_loss[best_k_idx, best_delta_idx - 1:best_delta_idx + 2]

                    best_ln_delta_interp, best_obj_interp = self.fit_parabola(log_deltas, error_3pt, output_prefix=None)
                    best_delta_interp = sp.exp(best_ln_delta_interp)
                    best_str += ", ln_d_interp=%.2f" % (best_ln_delta_interp)
                    logging.info("best interpolated ln_delta {0}".format(best_ln_delta_interp))
                else:
                    logging.warning("(select by %s): best ln_delta for all k is at the boundary (idx=%i) of search grid, please consider a larger grid" % (label, best_delta_idx))
                    #if output_prefix != None:
                    #create a size-zero file so that the cluster will aways have something to copy
                    #plot_fn=output_prefix+"_parabola.pdf"
                    #util.create_directory_if_necessary(plot_fn)
                    #open(plot_fn, "w").close()

            # save cv scores
            if create_pdf and (output_prefix is not None):
                # visualize results
                import matplotlib
                matplotlib.use('Agg')  # This lets it work even on machines without graphics displays
                import pylab
                pylab.figure()
                ax = pylab.subplot(111)
                try:
                    for delta_idx, delta in enumerate(delta_values):
                        ln_delta = sp.log(delta)
                        ax.semilogx(k_values, average_loss[:, delta_idx], "-x", label="ln_d=%.1f" % (ln_delta))

                    # Shrink current axis by 20%
                    box = ax.get_position()
                    ax.set_position([box.x0, box.y0, box.width * 0.8, box.height])

                    #TODO: this assumes the k_values are sorted:
                    pylab.ylim(ymax=average_loss[0].max() + abs(average_loss[0].max()) * 0.05)
                    if k_values[0] != 0:
                        logging.warning("Expect the first k value to be zero")  #!!move this change earlier
                    for i in range(len(k_values)):
                        if k_values[i] == 0:
                            ax.axhline(average_loss[i].max(), color='green')
                            mymin = average_loss.min()
                            mymax = average_loss[i].max()
                            diff = (mymax - mymin) * 0.05
                            pylab.ylim([mymin - diff, mymax + diff])

                    # Put a legend to the right of the current axis
                    ax.legend(loc='center left', bbox_to_anchor=(1, 0.5))

                    pylab.title(best_str)
                    pylab.ylabel(label)
                    pylab.xlabel("k")
                    pylab.grid(True)
                    #pylab.show()
                except Exception:  # plotting is best-effort; never let it kill the run
                    pass
                xval_fn = output_prefix + "_xval_%s.pdf" % label
                util.create_directory_if_necessary(xval_fn)
                pylab.savefig(xval_fn)
        elif strategy == "insample_cv":
            best_k_idx = average_loss.argmin()
            best_k = k_values[best_k_idx]
            best_obj = average_loss[best_k_idx]

            # check if unique over folds
            delta_array = np.array(best_delta_for_k)
            unique_deltas_for_k = set(delta_array[:, best_k_idx])
            if len(unique_deltas_for_k) > 1:
                logging.warning("ambiguous choice of delta for k: {0} {1}".format(best_k, unique_deltas_for_k))

            best_delta = np.median(delta_array[:, best_k_idx])

            best_str = "best k=%i, best delta=%.2f" % (best_k, best_delta)
            logging.info(best_str)
            if output_prefix is not None:
                split_idx = ["mean"] * len(k_values)
                for idx in range(len(loss_cv)):
                    split_idx.extend([idx] * loss_cv[idx].shape[0])

                stacked_result = np.vstack(loss_cv)
                stacked_result = np.vstack((average_loss, stacked_result))
                out_fn = output_prefix + "_" + label + ".csv"
                cols = pd.MultiIndex.from_arrays([split_idx, k_values * (self.num_folds + 1)], names=['split_id', 'k_value'])
                print("Christoph: bug, this is a quick fix that runs but may write out wrong results")
                df = pd.DataFrame(stacked_result.flatten()[:, None], columns=[label], index=cols)
                util.create_directory_if_necessary(out_fn)
                df.to_csv(out_fn)
            if create_pdf and (output_prefix is not None):
                # visualize results
                import matplotlib
                matplotlib.use('Agg')  # This lets it work even on machines without graphics displays
                import pylab
                pylab.figure()
                ax = pylab.subplot(111)
                try:
                    ax.semilogx(k_values, average_loss, "-x", label="loo")

                    # shrink current axis by 20%
                    box = ax.get_position()
                    #TODO: this assumes the k_values are sorted:
                    ax.set_position([box.x0, box.y0, box.width * 0.8, box.height])
                    pylab.ylim(ymax=average_loss[0].max() + abs(average_loss[0].max()) * 0.05)
                    if k_values[0] != 0:
                        logging.warning("Expect the first k value to be zero")  #!!move this change earlier
                    for i in range(len(k_values)):
                        if k_values[i] == 0:
                            ax.axhline(average_loss[i].max(), color='green')
                            mymin = average_loss.min()
                            mymax = average_loss[i].max()
                            diff = (mymax - mymin) * 0.05
                            pylab.ylim([mymin - diff, mymax + diff])
                    # Put a legend to the right of the current axis
                    ax.legend(loc='center left', bbox_to_anchor=(1, 0.5))
                    pylab.title(best_str)
                    pylab.ylabel(label)
                    pylab.xlabel("k")
                    pylab.grid(True)
                except Exception:  # plotting is best-effort; never let it kill the run
                    pass
                plot_fn = output_prefix + "_xval_%s.pdf" % label
                util.create_directory_if_necessary(plot_fn)
                pylab.savefig(plot_fn)
        else:
            raise NotImplementedError(strategy)
        return best_k, best_delta, best_obj, best_delta_interp, best_obj_interp
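fit_parabola is not shown on this page; a standard three-point version consistent with its use above fits a quadratic to (ln_delta, loss) and returns the vertex (the name and signature below are assumptions):

import numpy as np

def fit_parabola_3pt(log_deltas, errors):
    a, b, c = np.polyfit(log_deltas, errors, 2)  # loss ~ a*x**2 + b*x + c
    best_x = -b / (2.0 * a)                      # vertex of the fitted parabola
    best_y = np.polyval([a, b, c], best_x)
    return best_x, best_y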
Example #49
0
    def submit_to_cluster(self, batfilename_rel, distributable, remotewd, run_dir_abs, run_dir_rel, nodelocalwd):
        stdout_dir_rel = os.path.join(run_dir_rel,"stdout")
        stdout_dir_abs = os.path.join(run_dir_abs,"stdout")
        util.create_directory_if_necessary(stdout_dir_abs, isfile=False)
        stderr_dir_rel = os.path.join(run_dir_rel,"stderr")
        stderr_dir_abs = os.path.join(run_dir_abs,"stderr")
        util.create_directory_if_necessary(stderr_dir_abs, isfile=False)
        
        if len(self.excluded_nodes) > 0:
            excluded_nodes = "Set-HpcJob -Id $r.Id -addExcludedNodes {0}".format(", ".join(self.excluded_nodes))
        else:
            excluded_nodes = ""


        #create the Powershell file
        psfilename_rel = os.path.join(run_dir_rel,"dist.ps1")
        psfilename_abs = os.path.join(run_dir_abs,"dist.ps1")
        util.create_directory_if_necessary(psfilename_abs, isfile=True)
        with open(psfilename_abs, "w") as psfile:
            psfile.write(r"""Add-PsSnapin Microsoft.HPC
        Set-Content Env:CCP_SCHEDULER {0}
        $r = New-HpcJob -Name "{7}" -Priority {8}{12}{14}{16} -RunTime {15} -FailOnTaskFailure {23} #-Preemptable {22}
        $r.Id
        if ({20})
        {10}
            $from = "{4}"
            $to = "{17}"
            Add-HpcTask -Name NodePrep    -JobId $r.Id -Type NodePrep                -CommandLine "${{from}}\{18}"        -StdOut "${{from}}\{2}\nodeprep.txt"    -StdErr "${{from}}\{3}\nodeprep.txt"    -WorkDir .
            Add-HpcTask -Name Parametric  -JobId $r.Id -Parametric -Start 0 -End {1} -CommandLine "${{from}}\{6} * {5}"   -StdOut "${{from}}\{2}\*.txt"    -StdErr "${{from}}\{3}\*.txt"                  -WorkDir $to
            Add-HpcTask -Name Reduce      -JobId $r.Id -Depend Parametric            -CommandLine "${{from}}\{6} {5} {5}" -StdOut "${{from}}\{2}\reduce.txt"      -StdErr "${{from}}\{3}\reduce.txt"      -WorkDir $to
            {21}Add-HpcTask -Name NodeRelease -JobId $r.Id -Type NodeRelease         -CommandLine "${{from}}\{19}"        -StdOut "${{from}}\{2}\noderelease.txt" -StdErr "${{from}}\{3}\noderelease.txt" -WorkDir .
        {11}
        else
        {10}
            Add-HpcTask -Name Parametric -JobId $r.Id -Parametric -Start 0 -End {1} -CommandLine "{6} * {5}" -StdOut "{2}\*.txt" -StdErr "{3}\*.txt" -WorkDir {4}
            Add-HpcTask -Name Reduce -JobId $r.Id -Depend Parametric -CommandLine "{6} {5} {5}" -StdOut "{2}\reduce.txt" -StdErr "{3}\reduce.txt" -WorkDir {4}
        {11}

        {13}
        Submit-HpcJob -Id $r.Id
        $j = Get-HpcJob -Id $r.Id
        $i = $r.id
        $s = 10

        while(($j.State -ne "Finished") -and ($j.State -ne "Failed") -and ($j.State -ne "Canceled"))
        {10}
            $x = $j.State
            Write-Host "${10}x{11}. Job# ${10}i{11} sleeping for ${10}s{11}"
            Start-Sleep -s $s
            if ($s -ge 60)
            {10}
            $s = 60
            {11}
            else
            {10}
                $s = $s * 1.1
            {11}
           $j.Refresh()
        {11}

        """                 .format(
                                self.clustername,   #0
                                self.taskcount-1,   #1
                                stdout_dir_rel,     #2
                                stderr_dir_rel,     #3
                                remotewd,           #4 fileshare wd
                                self.taskcount,     #5
                                batfilename_rel,    #6
                                self.maxlen(str(distributable),50),      #7
                                self.priority,      #8
                                self.unit,          #9 -- no longer used; #12 sets the unit instead
                                "{",                #10
                                "}",                #11
                                self.numString(),   #12
                                excluded_nodes,     #13
                                ' -templateName "{0}"'.format(self.template) if self.template is not None else "", #14
                                self.runtime,       #15 RuntimeSeconds
                                ' -NodeGroups "{0}"'.format(self.nodegroups) if self.nodegroups is not None else "", #16
                                nodelocalwd,        #17 the node-local wd
                                batfilename_rel[0:-8]+"nodeprep.bat", #18
                                batfilename_rel[0:-8]+"noderelease.bat", #19
                                1 if self.node_local else 0,             #20
                                "",                                      #21 always run release task
                                self.preemptable,                        #22
                                '$true' if self.FailOnTaskFailure else '$false',   #23
                                ))
        assert batfilename_rel[-8:] == "dist.bat", "real assert"
        import subprocess
        proc = subprocess.Popen(["powershell.exe", "-ExecutionPolicy", "Unrestricted", psfilename_abs], cwd=os.getcwd())
        if proc.wait() != 0: raise Exception("Running the PowerShell cluster submit script returned a non-zero exit code")
Example #50
0
    def reduce_result(self, loss_cv, k_values, delta_values, strategy, output_prefix, best_delta_for_k, label="mse", create_pdf=True):
        """
        turn cross-validation results into results
        """
        #self.run_once() #Don't need and saves time

        # average over splits
        average_loss = np.array(loss_cv).mean(axis=0)
        #average_loss = np.vstack(loss_cv).mean(axis=0)
        best_ln_delta_interp, best_obj_interp, best_delta_interp = (None,None,None)

        # reconstruct results
        if strategy == "lmm_full_cv":
            # save cv scores
            if output_prefix is not None:
                split_idx = ["mean"]*len(k_values)
                for idx in range(len(loss_cv)):
                    split_idx.extend([idx]*loss_cv[idx].shape[0])
                                
                stacked_result = np.vstack(loss_cv)
                stacked_result = np.vstack((average_loss, stacked_result))
                
                out_fn = output_prefix + "_" + label  + ".csv"
                cols = pd.MultiIndex.from_arrays([split_idx, k_values*(self.num_folds+1)], names=['split_id','k_value'])
                df = pd.DataFrame(stacked_result, columns=delta_values, index=cols)
                util.create_directory_if_necessary(out_fn)
                df.to_csv(out_fn)  # note: to_csv has no column_label argument; passing one raises TypeError
            
            # make sure delta is not at the boundary for any k
            assert average_loss.shape[0] == len(k_values)
            for k_idx in range(average_loss.shape[0]):
                tmp_idx = np.argmin(average_loss[k_idx])

                if tmp_idx == 0 or tmp_idx == len(delta_values)-1:
                    logging.warning("(select by %s): ln_delta for k=%i is at the boundary (idx=%i) of defined delta grid" % (label, k_values[k_idx], tmp_idx))
            
            best_k_idx, best_delta_idx = np.unravel_index(average_loss.argmin(), average_loss.shape)
            best_k, best_delta = k_values[best_k_idx], delta_values[best_delta_idx]
            best_obj = average_loss[best_k_idx, best_delta_idx]
            best_ln_delta = np.log(best_delta)
            best_str = "best: k=%i, ln_d=%.1f, obj=%.2f" % (best_k, best_ln_delta, best_obj)
            
            # fit parabola to 3 points in logspace
            if self.interpolate_delta:
                if best_delta_idx!=0 and best_delta_idx!=len(delta_values)-1:
                    log_deltas = [np.log(d) for d in delta_values[best_delta_idx-1:best_delta_idx+2]]
                    error_3pt = average_loss[best_k_idx, best_delta_idx-1:best_delta_idx+2]
                    
                    best_ln_delta_interp, best_obj_interp = self.fit_parabola(log_deltas, error_3pt, output_prefix=None)
                    best_delta_interp = sp.exp(best_ln_delta_interp)
                    best_str += ", ln_d_interp=%.2f" % (best_ln_delta_interp)
                    logging.info("best interpolated ln_delta {0}".format(best_ln_delta_interp))
                else:
                    logging.warning("(select by %s): best ln_delta for all k is at the boundary (idx=%i) of search grid, please consider a larger grid" % (label, best_delta_idx))
                    #if output_prefix != None:
                        #create a size-zero file so that the cluster will aways have something to copy
                        #plot_fn=output_prefix+"_parabola.pdf"
                        #util.create_directory_if_necessary(plot_fn)
                        #open(plot_fn, "w").close()

            # save cv scores
            if create_pdf and (output_prefix is not None):
                # visualize results
                import matplotlib
                matplotlib.use('Agg') #This lets it work even on machines without graphics displays
                import pylab
                pylab.figure()
                ax = pylab.subplot(111)
                try:
                    for delta_idx, delta in enumerate(delta_values):
                        ln_delta = sp.log(delta)
                        ax.semilogx(k_values, average_loss[:,delta_idx], "-x", label="ln_d=%.1f" % (ln_delta))

                    # Shrink current axis by 20%
                    box = ax.get_position()
                    ax.set_position([box.x0, box.y0, box.width * 0.8, box.height])

                    #TODO: this assumes the k_values are sorted:
                    pylab.ylim(ymax=average_loss[0].max() + abs(average_loss[0].max())*0.05 )
                    if k_values[0] != 0: logging.warning("Expect the first k value to be zero") #!!move this change earlier
                    for i in range(len(k_values)):
                        if k_values[i] == 0:
                            ax.axhline(average_loss[i].max(), color = 'green')
                            mymin = average_loss.min() 
                            mymax = average_loss[i].max()
                            diff = (mymax-mymin)*0.05
                            pylab.ylim([mymin-diff,mymax+diff])                

                    # Put a legend to the right of the current axis
                    ax.legend(loc='center left', bbox_to_anchor=(1, 0.5))

                    pylab.title(best_str)
                    pylab.ylabel(label)
                    pylab.xlabel("k")
                    pylab.grid(True)
                    #pylab.show()
                except Exception:  # plotting is best-effort; never let it kill the run
                    pass
                xval_fn = output_prefix + "_xval_%s.pdf" % label
                util.create_directory_if_necessary(xval_fn)
                pylab.savefig(xval_fn)
        elif strategy == "insample_cv":
            best_k_idx = average_loss.argmin()
            best_k = k_values[best_k_idx]
            best_obj = average_loss[best_k_idx]

            # check if unique over folds
            delta_array = np.array(best_delta_for_k)
            unique_deltas_for_k = set(delta_array[:,best_k_idx])
            if len(unique_deltas_for_k) > 1:
                logging.warning("ambiguous choice of delta for k: {0} {1}".format(best_k, unique_deltas_for_k))

            best_delta = np.median(delta_array[:,best_k_idx])

            best_str = "best k=%i, best delta=%.2f" % (best_k, best_delta)
            logging.info(best_str)
            if output_prefix is not None:
                split_idx = ["mean"]*len(k_values)
                for idx in range(len(loss_cv)):
                    split_idx.extend([idx]*loss_cv[idx].shape[0])
                                
                stacked_result = np.vstack(loss_cv)
                stacked_result = np.vstack((average_loss, stacked_result))
                out_fn = output_prefix + "_" + label  + ".csv"
                cols = pd.MultiIndex.from_arrays([split_idx, k_values*(self.num_folds+1)], names=['split_id','k_value'])
                print "Christoph: bug, this is a quick fix that runs but may write out wrong results"
                df = pd.DataFrame(stacked_result.flatten()[:, None], columns=[label], index=cols)
                util.create_directory_if_necessary(out_fn)
                df.to_csv(out_fn)  # note: to_csv has no column_label argument; passing one raises TypeError
            if create_pdf and (output_prefix is not None):
                # visualize results
                import matplotlib
                matplotlib.use('Agg') #This lets it work even on machines without graphics displays
                import pylab
                pylab.figure()
                ax = pylab.subplot(111)
                try:
                    ax.semilogx(k_values, average_loss, "-x", label="loo")

                    # shrink current axis by 20%
                    box = ax.get_position()
                    #TODO: this assumes the k_values are sorted:
                    ax.set_position([box.x0, box.y0, box.width * 0.8, box.height])
                    pylab.ylim(ymax=average_loss[0].max() + abs(average_loss[0].max())*0.05 )
                    if k_values[0] != 0: logging.warning("Expect the first k value to be zero") #!!move this change earlier
                    for i in range(len(k_values)):
                        if k_values[i] == 0:
                            ax.axhline(average_loss[i].max(), color = 'green')
                            mymin = average_loss.min()
                            mymax = average_loss[i].max()
                            diff = (mymax-mymin)*0.05
                            pylab.ylim([mymin-diff,mymax+diff])
                    # Put a legend to the right of the current axis
                    ax.legend(loc='center left', bbox_to_anchor=(1, 0.5))
                    pylab.title(best_str)
                    pylab.ylabel(label)
                    pylab.xlabel("k")
                    pylab.grid(True)
                except Exception:  # plotting is best-effort; never let it kill the run
                    pass
                plot_fn = output_prefix + "_xval_%s.pdf"%label
                util.create_directory_if_necessary(plot_fn)
                pylab.savefig(plot_fn)
        else:
            raise NotImplementedError(strategy)
        return best_k, best_delta, best_obj, best_delta_interp, best_obj_interp
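The CSV written by reduce_result stacks a "mean" block on top of one block per fold, keyed by a (split_id, k_value) MultiIndex. A small self-contained sketch of that layout (2 k values, 3 deltas, and 2 folds are assumed toy sizes):

import numpy as np
import pandas as pd

k_values, num_folds = [0, 10], 2
loss_cv = [np.random.rand(len(k_values), 3) for _ in range(num_folds)]
average_loss = np.array(loss_cv).mean(axis=0)
split_idx = ["mean"] * len(k_values)
for idx in range(num_folds):
    split_idx.extend([idx] * loss_cv[idx].shape[0])
cols = pd.MultiIndex.from_arrays([split_idx, k_values * (num_folds + 1)], names=["split_id", "k_value"])
df = pd.DataFrame(np.vstack((average_loss, np.vstack(loss_cv))), index=cols)
print(df)  # rows: (mean,0), (mean,10), (0,0), (0,10), (1,0), (1,10)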
Example #51
0
    def submit_to_cluster(self, batfilename_rel_list,
                          fileInWorkingDirectoryList, tgzList,
                          tgzListPythonPath, tgzListPythonSettings,
                          distributable, remotewd, run_dir_abs, run_dir_rel):
        logging.info('Hadoop runner is submitting to cluster')

        #!! e.g. hdfs://rr1-n13-02-c02/user/carlk/inputs.tgz#inputs,hdfs://rr1-n13-02-c02/user/carlk/datasets.tgz#datasets,hdfs://rr1-n13-02-c02/user/carlk/src.tgz#src
        #!! could do this functionally
        archivesStringList = []
        for tgz in tgzList:
            archiveString = "hdfs:{0}#{1}".format(tgz[1],
                                                  os.path.splitext(tgz[0])[0])
            archivesStringList.append(archiveString)
        archivesStringList.append(tgzListPythonSettings)
        for tgz in tgzListPythonPath:
            archivesStringList.append(tgz)

        #e.g. distMapper.bat,distReducer.bat
        filesString = ",".join(batfilename_rel_list +
                               fileInWorkingDirectoryList)

        taskIndexDir = run_dir_rel + os.path.sep + "input"
        util.create_directory_if_necessary(taskIndexDir, isfile=False)

        #zgoal = int(SP.ceil(SP.log(self.taskcount)/SP.log(10)))
        with open(taskIndexDir + os.path.sep + "taskIndexList.txt", "w") as taskIndexListFile:
            for taskIndex in range(self.taskcount):
                taskIndexListFile.write("{0}\n".format(taskIndex))  # str(taskIndex).zfill(zgoal)))

        #hadoop fs -rmr runs/2013-08-02_13_51_42
        #hadoop fs -copyFromLocal runs\2013-08-02_13_51_42 runs/2013-08-02_13_51_42
        #hadoop jar %HADOOP_HOME%\lib\hadoop-streaming.jar ^
        #        -archives "hdfs:/user/carlk/source/carlkextranet05312013/ERG01/src/tests/datasets.2013-07-31_11_12_11.tgz#datasets,hdfs:/user/carlk/runs/pythonpath.0.src.2013-07-31_14_30_56/src.tgz#pythonpath.0.src" ^
        #        -files "hdfs:/user/carlk/runs/2013-08-02_13_51_42/distMapper.bat,hdfs:/user/carlk/runs/2013-08-02_13_51_42/distReducer.bat,hdfs:/user/carlk/runs/2013-08-02_13_51_42/distributable.p" ^
        #        -input "runs/2013-08-02_13_51_42/input" ^
        #        -output "runs/2013-08-02_13_51_42/output" ^
        #        -mapper "distMapper.bat" ^
        #       -reducer "distReducer.bat"
        #hadoop fs -cat runs/2013-08-02_13_51_42/output/part-00000  | more
        s00 = r"%HADOOP_HOME%\bin\hadoop fs -rmr -skipTrash {0}".format(
            run_dir_rel.replace("\\", "/"))
        s0 = r"%HADOOP_HOME%\bin\hadoop fs -copyFromLocal {0} {1}".format(
            run_dir_rel, run_dir_rel.replace("\\", "/"))

        #-D mapreduce.reduce.shuffle.connect.timeout=3600000 ^
        #-D io.sort.mb=1400 ^
        #-D job.end.retry.interval=3600000 ^
        #-D mapred.tasktracker.expiry.interval=3600000 ^

        logging.info("running {0}".format(str(distributable)))

        s = r"""%HADOOP_HOME%\bin\hadoop jar %HADOOP_HOME%\lib\hadoop-streaming.jar ^
        -archives "{0}" ^
        -files "{1}" ^
        -D mapred.job.name="{8}" ^
        -D mapred.map.tasks={4} ^
        -D mapred.reduce.tasks=1 ^
        -D mapred.job.map.memory.mb={5} ^
        -D mapred.job.reduce.memory.mb={6} ^
        -D mapred.task.timeout={7} ^
        -D mapred.job.queue.name="{9}" ^
        -input {2} ^
        -output {3} ^
        -inputformat org.apache.hadoop.mapred.lib.NLineInputFormat ^
        -mapper "distMapper.bat" ^
        -reducer "distReducer.bat"
            """.format(
            ",".join(archivesStringList),  #0
            filesString,  #1
            taskIndexDir.replace("\\", "/"),  #2
            (run_dir_rel + os.path.sep + "output").replace("\\", "/"),  #3
            self.taskcount,  #4
            self.mapmemory,  #5
            self.reducememory,  #6
            0,  #7
            str(distributable),  #8
            self.queue  #9
        )
        runHadoopFileName = run_dir_rel + os.path.sep + "runHadoop.bat"
        logging.info("Hadoop runner is creating '{0}'".format(runHadoopFileName))
        with open(runHadoopFileName, "w") as runHadoopFile:
            runHadoopFile.write("call {0}\n".format(s00))
            runHadoopFile.write("call {0}\n".format(s0))
            runHadoopFile.write("call {0}\n".format(s))

        sOneLine = "".join(s.split("^\n"))

        logging.info("Hadoop runner running the copyFromLocal")
        with TemporaryFile() as output:
            stdout0 = subprocess.check_output(s0, stderr=output, shell=True)
            output.seek(0)
            stderr0 = output.read().decode()  # TemporaryFile is binary; decode so the equality test below works on Python 3
        logging.info("Result from 'Hadoop runner running the copyFromLocal' is stdout='{0}', stderr='{1}'".format(stdout0, stderr0))
        if stderr0 != "":
            raise Exception("Stderr from command: '{0}'".format(stderr0))
        logging.info("Hadoop runner running the streamingjar")

        with TemporaryFile() as output:
            stdout = subprocess.check_output(sOneLine, stderr=output, shell=True)
            output.seek(0)
            stderr = output.read().decode()
        logging.info("Result from 'Hadoop runner running the streamingjar' is stdout='{0}', stderr='{1}'".format(stdout, stderr))
        logging.info('Done: Hadoop runner is submitting to cluster')
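On Python 3 the TemporaryFile dance above can be replaced by subprocess.run, which captures stdout and stderr as text directly; a sketch (the command is a placeholder):

import logging
import subprocess

result = subprocess.run("hadoop fs -ls /", shell=True, capture_output=True, text=True)
if result.stderr:
    raise Exception("Stderr from command: '{0}'".format(result.stderr))
logging.info("stdout='{0}'".format(result.stdout))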