def CheckRemoveHDir(self, directory, remove):
    """Returns True if the directory is absent from HDFS, either because it
    never existed or because this call just removed it."""
    nodir = False
    if (self.CheckHDir(directory)):
        if (remove):
            rm_str = self.hadooprmr_ + directory
            pyutil.runCommand(rm_str)
            nodir = True
    else:
        nodir = True
    return nodir
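
# Hedged usage sketch (not part of the original source): assuming the class
# is constructed so that hadooprmr_ holds a command prefix such as
# "hadoop fs -rmr ", a caller could clear a stale output directory like so:
#
#   util = HadoopRunner()                    # hypothetical constructor
#   if util.CheckRemoveHDir("/user/me/out", remove=True):
#       pass  # directory was absent, or was removed just now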
def CheckInputFile(self, inputfile, hdfsinputdir, outputdir, force, uncompress):
    input_file_list = ""
    if (inputfile.endswith(".gz") and uncompress):
        input_filename = os.path.basename(inputfile).replace(".gz", "")
    else:
        input_filename = os.path.basename(inputfile)
    pyutil.printDebug(1, "Processing input " + input_filename + "\n")
    # Check that the input data exists on HDFS and copy it there if necessary.
    hdfsinputfile = hdfsinputdir + "/" + input_filename
    if (not self.CheckHDFSFile(hdfsinputfile) or force):
        pyutil.printInfo("Regenerating HDFS input: " + hdfsinputfile)
        if (not self.CheckHDir(hdfsinputdir)):
            pyutil.runCommand(self.hadoopmkdir_ + hdfsinputdir)
        if (inputfile.endswith(".gz") and uncompress):
            # Uncompress locally first, then move the result into HDFS.
            new_input = outputdir + "/" + input_filename
            unzipcmd = "gunzip -c " + inputfile + " > " + new_input
            if (pyutil.runCommand(unzipcmd) != 0):
                pyutil.printError(12, "Unable to unzip file: " + inputfile)
            pyutil.runCommand(self.hadoopmove_ + new_input + " " + hdfsinputdir)
            input_file_list += " --input " + hdfsinputdir + "/" + input_filename
        else:
            pyutil.runCommand(self.hadoopput_ + inputfile + " " + hdfsinputdir)
            input_file_list += " --input " + hdfsinputdir + "/" + input_filename
        if (not self.CheckHDFSFile(hdfsinputfile)):
            pyutil.printError(10, "Unable to create input on HDFS: " + hdfsinputfile)
    else:
        input_file_list += " --input " + hdfsinputdir + "/" + input_filename
        pyutil.printDebug(5, "Found file on HDFS: " + hdfsinputdir + "/" + input_filename)
    return input_file_list
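
# Illustrative call (assumptions: corpus.txt.gz, /hdfs/in, and /tmp/out are
# placeholder paths; hadoopput_/hadoopmove_/hadoopmkdir_ wrap the usual
# "hadoop fs -put/-mv/-mkdir" commands). With uncompress=True the local
# gzip file is unpacked, staged on HDFS, and the streaming flag is returned:
#
#   flags = util.CheckInputFile("corpus.txt.gz", "/hdfs/in", "/tmp/out",
#                               force=False, uncompress=True)
#   # flags == " --input /hdfs/in/corpus.txt"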
def RunMR(self, input_files, outputdir, reduce_tasks, reducer, mapper, mroptions):
    mr_str = self.hadoopmr_
    if (mroptions):
        mr_str += mroptions + " "
    mr_str += self.hadooplibpath_ + input_files
    if not reducer:
        # Map-only job: disable the reduce phase entirely.
        mr_str += " -numReduceTasks 0 --reducer None "
        #mr_str += " -numReduceTasks 100 --reducer cat "
    else:
        if (int(reduce_tasks) >= 0):
            mr_str += " -numReduceTasks " + str(reduce_tasks)
        mr_str += " --reducer " + reducer
    mr_str += " --output " + outputdir
    mr_str += " --mapper " + mapper
    pyutil.printInfo("Running MR on: " + input_files)
    if (pyutil.runCommand(mr_str) != 0):
        pyutil.printError(33, "Error running MR: " + mr_str)
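
# For reference, a sketch of the command RunMR assembles (the exact prefix
# depends on how hadoopmr_ and hadooplibpath_ are configured; the flag
# spelling below simply mirrors what the method concatenates):
#
#   <hadoopmr_> [mroptions] <hadooplibpath_> --input /hdfs/in/corpus.txt \
#       -numReduceTasks 10 --reducer ./reduce.sh \
#       --output /hdfs/out --mapper ./map.sh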
def CheckHDFSFile(self, filename):
    hdinput_test = self.hadooptest_ + "-e " + filename
    return (pyutil.runCommand(hdinput_test) == 0)
def CheckHDir(self, directory):
    test_str = self.hadooptest_ + "-d " + directory
    return (pyutil.runCommand(test_str) == 0)
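
# Both helpers rely on the exit status of "hadoop fs -test": with -e it
# returns 0 when the path exists, and with -d when the path is a directory,
# so a zero from pyutil.runCommand maps directly to True. This assumes
# hadooptest_ ends with a trailing space (e.g. "hadoop fs -test ") so that
# concatenation yields a well-formed command:
#
#   util.hadooptest_ = "hadoop fs -test "        # assumed prefix
#   util.CheckHDFSFile("/hdfs/in/corpus.txt")    # hadoop fs -test -e <path>
#   util.CheckHDir("/hdfs/in")                   # hadoop fs -test -d <path>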
def CatPipe(self, hdfsfiles, pipecmd):
    catcmd = self.hadoopcat_ + " " + hdfsfiles + " | " + pipecmd
    if (pyutil.runCommand(catcmd)):
        pyutil.printError(34, "Error: " + catcmd)
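
# Hedged example: stream an HDFS glob through a local shell pipeline,
# assuming hadoopcat_ holds a prefix like "hadoop fs -cat":
#
#   util.CatPipe("/hdfs/out/part-*", "sort | head -20")
#   # runs: hadoop fs -cat /hdfs/out/part-* | sort | head -20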