def CheckInputFile(self, inputfile, hdfsinputdir, outputdir, force, uncompress):
    """Ensure a single input file is present on HDFS, copying it there if needed.

    Args:
      inputfile: Local path to the input file (possibly gzip-compressed).
      hdfsinputdir: HDFS directory that should contain the input.
      outputdir: Local scratch directory used when uncompressing .gz inputs.
      force: If true, re-copy the input to HDFS even if it already exists.
      uncompress: If true, gunzip *.gz inputs locally before moving to HDFS.

    Returns:
      A " --input <hdfs path>" argument fragment for the streaming command.
    """
    # The HDFS-side name drops the .gz suffix when we will uncompress.
    if inputfile.endswith(".gz") and uncompress:
        input_filename = os.path.basename(inputfile).replace(".gz", "")
    else:
        input_filename = os.path.basename(inputfile)
    pyutil.printDebug(1, "Processing input " + input_filename + "\n")
    # Check that the input data exists on HDFS and copy it there if necessary.
    hdfsinputfile = hdfsinputdir + "/" + input_filename
    if not self.CheckHDFSFile(hdfsinputfile) or force:
        pyutil.printInfo("Regenerating HDFS input: " + hdfsinputfile)
        if not self.CheckHDir(hdfsinputdir):
            pyutil.runCommand(self.hadoopmkdir_ + hdfsinputdir)
        if inputfile.endswith(".gz") and uncompress:
            # Uncompress into the local output dir, then move into HDFS.
            new_input = outputdir + "/" + input_filename
            unzipcmd = "gunzip -c " + inputfile + " > " + new_input
            if pyutil.runCommand(unzipcmd) != 0:
                pyutil.printError(12, "Unable to unzip file: " + inputfile)
            pyutil.runCommand(self.hadoopmove_ + new_input + " " + hdfsinputdir)
        else:
            pyutil.runCommand(self.hadoopput_ + inputfile + " " + hdfsinputdir)
        if not self.CheckHDFSFile(hdfsinputfile):
            pyutil.printError(10, "Unable to create input on HDFS: " + hdfsinputfile)
    else:
        pyutil.printDebug(5, "Found file on HDFS: " + hdfsinputdir + "/" + input_filename)
    # Every path above yields the same single --input fragment; build it once
    # instead of appending the identical string in three separate branches.
    return " --input " + hdfsinputdir + "/" + input_filename
def RunMR(self, input_files, outputdir, reduce_tasks, reducer, mapper, mroptions):
    """Assemble and execute a hadoop streaming MapReduce command.

    Args:
      input_files: Pre-built " --input ..." argument string(s).
      outputdir: HDFS output directory for the job.
      reduce_tasks: Number of reduce tasks; values < 0 leave the default.
      reducer: Reducer command, or a falsy value for a map-only job.
      mapper: Mapper command.
      mroptions: Optional extra options inserted right after the base command.

    Exits via pyutil.printError (code 33) if the job command fails.
    """
    mr_str = self.hadoopmr_
    if mroptions:
        mr_str += mroptions + " "
    mr_str += self.hadooplibpath_ + input_files
    if not reducer:
        # Map-only job: disable the reduce phase entirely.
        mr_str += " -numReduceTasks 0 --reducer None "
    else:
        # Only override the task count when a non-negative value was given.
        if int(reduce_tasks) >= 0:
            mr_str += " -numReduceTasks " + str(reduce_tasks)
        mr_str += " --reducer " + reducer
    mr_str += " --output " + outputdir
    mr_str += " --mapper " + mapper
    pyutil.printInfo("Running MR on: " + input_files)
    if pyutil.runCommand(mr_str) != 0:
        pyutil.printError(33, "Error running MR" + mr_str)
def CatPipeRead(self, hdfsfiles, pipecmd, retval):
    """Cat HDFS files into a shell pipeline and collect its output via retval.

    Exits via pyutil.printError (code 30) when the pipeline fails.
    """
    command = self.hadoopcat_ + " " + hdfsfiles + " | " + pipecmd
    succeeded = pyutil.readCommand(command, retval)
    if not succeeded:
        pyutil.printError(30, "Error running command " + command)
def CatPipe(self, hdfsfiles, pipecmd):
    """Cat HDFS files into a shell pipeline, exiting (code 34) on failure."""
    full_cmd = "%s %s | %s" % (self.hadoopcat_, hdfsfiles, pipecmd)
    if pyutil.runCommand(full_cmd):
        pyutil.printError(34, "Error: " + full_cmd)
optParse.error("--outputdir option is required") pyutil.DEBUG = options.verbosity # Attempt to find the hadoop installation. hadooproot = options.hadooproot if not hadooproot: if os.path.isdir("/usr/lib/hadoop"): hadooproot = "/usr/lib/hadoop" elif os.path.isdir("/usr/local/lib/hadoop"): hadooproot = "/usr/local/lib/hadoop" elif os.path.isdir("/opt/lib/hadoop"): hadooproot = "/opt/lib/hadoop" else: pyutil.printError( 10, "Unable to find the hadoop installation. " + "Please specify with --hadooproot.") streamingloc = options.streamingloc if not streamingloc: if os.path.exists(hadooproot + "/hadoop-streaming.jar"): streamingloc = hadooproot + "/hadoop-streaming.jar" else: tmppath = hadooproot + "/contrib/streaming" if not os.path.isdir(tmppath): pyutil.printError( 10, hadooproot + "/contrib/streaming does not exist. " + "Please specify location of hadoop streaming jar file with " + "--streamingloc") streamingjar = glob.glob(tmppath + "/hadoop-streaming*.jar") if len(streamingjar) != 1:
# Required-flag validation.
if not options.outputdir:
    optParse.error("--outputdir option is required")
pyutil.DEBUG = options.verbosity

# Attempt to find the hadoop installation: honor --hadooproot if given,
# otherwise probe the conventional install locations in order.
hadooproot = options.hadooproot
if not hadooproot:
    for candidate in ("/usr/lib/hadoop", "/usr/local/lib/hadoop", "/opt/lib/hadoop"):
        if os.path.isdir(candidate):
            hadooproot = candidate
            break
    else:
        pyutil.printError(10, "Unable to find the hadoop installation. "
                          "Please specify with --hadooproot.")

# Locate the hadoop streaming jar: top-level jar first, then contrib/streaming.
streamingloc = options.streamingloc
if not streamingloc:
    toplevel_jar = hadooproot + "/hadoop-streaming.jar"
    if os.path.exists(toplevel_jar):
        streamingloc = toplevel_jar
    else:
        tmppath = hadooproot + "/contrib/streaming"
        if not os.path.isdir(tmppath):
            pyutil.printError(10, hadooproot + "/contrib/streaming does not exist. "
                              "Please specify location of hadoop streaming jar file with "
                              "--streamingloc")
        streamingjar = glob.glob(tmppath + "/hadoop-streaming*.jar")
        # Exactly one match is required; anything else is ambiguous.
        if len(streamingjar) != 1:
            pyutil.printError(10, "Unable to find streaming jar, please specify with --streamingloc")
        streamingloc = streamingjar[0]