Example 1
 def CheckRemoveHDir(self, directory, remove):
   """Return True if the directory is absent from HDFS (or was just removed)."""
   nodir = False
   if self.CheckHDir(directory):
     if remove:
       # Directory exists: remove it recursively so it can be regenerated.
       rm_str = self.hadooprmr_ + directory
       pyutil.runCommand(rm_str)
       nodir = True
   else:
     # Directory was not there to begin with.
     nodir = True
   return nodir
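
The hadooprmr_ command prefix and the pyutil.runCommand helper are defined elsewhere in this class. A minimal standalone sketch of the same test-then-remove pattern, assuming the hadoop fs CLI is on PATH (-rm -r on current Hadoop; older releases spell it -rmr), could look like:

import subprocess

def check_remove_hdfs_dir(directory, remove):
  # `hadoop fs -test -d` exits 0 when the directory exists on HDFS.
  exists = subprocess.call(["hadoop", "fs", "-test", "-d", directory]) == 0
  if not exists:
    return True
  if remove:
    # Recursive delete, mirroring the hadooprmr_ command above.
    subprocess.call(["hadoop", "fs", "-rm", "-r", directory])
    return True
  return False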
Example 2
 def CheckInputFile(self, inputfile, hdfsinputdir, outputdir, force, uncompress):
   """Ensure inputfile is on HDFS; return its " --input <path>" argument string."""
   input_file_list = ""
   if inputfile.endswith(".gz") and uncompress:
     input_filename = os.path.basename(inputfile).replace(".gz", "")
   else:
     input_filename = os.path.basename(inputfile)
   pyutil.printDebug(1, "Processing input " + input_filename + "\n")
   # Check that the input data exists on HDFS and copy it there if necessary.
   hdfsinputfile = hdfsinputdir + "/" + input_filename
   if not self.CheckHDFSFile(hdfsinputfile) or force:
     pyutil.printInfo("Regenerating HDFS input: " + hdfsinputfile)
     if not self.CheckHDir(hdfsinputdir):
       pyutil.runCommand(self.hadoopmkdir_ + hdfsinputdir)
     if inputfile.endswith(".gz") and uncompress:
       # Uncompress to a local scratch file, then move it into HDFS.
       new_input = outputdir + "/" + input_filename
       unzipcmd = "gunzip -c " + inputfile + " > " + new_input
       if pyutil.runCommand(unzipcmd) != 0:
         pyutil.printError(12, "Unable to unzip file: " + inputfile)
       pyutil.runCommand(self.hadoopmove_ + new_input + " " + hdfsinputdir)
     else:
       pyutil.runCommand(self.hadoopput_ + inputfile + " " + hdfsinputdir)
     input_file_list += " --input " + hdfsinputdir + "/" + input_filename
     if not self.CheckHDFSFile(hdfsinputfile):
       pyutil.printError(10, "Unable to create input on HDFS: " + hdfsinputfile)
   else:
     input_file_list += " --input " + hdfsinputdir + "/" + input_filename
     pyutil.printDebug(5, "Found file on HDFS: " + hdfsinputdir + "/" + input_filename)
   return input_file_list
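
The branch logic above decompresses .gz inputs to a local scratch file before moving them into HDFS, and pushes everything else directly. A self-contained sketch of that staging step with subprocess; the -moveFromLocal and -put spellings are assumptions about what hadoopmove_ and hadoopput_ expand to:

import os
import subprocess

def stage_input(inputfile, hdfsinputdir, localtmpdir, uncompress):
  name = os.path.basename(inputfile)
  if uncompress and name.endswith(".gz"):
    name = name[:-3]
    local_copy = os.path.join(localtmpdir, name)
    # Decompress to local scratch, then move the result into HDFS.
    with open(local_copy, "wb") as out:
      subprocess.check_call(["gunzip", "-c", inputfile], stdout=out)
    subprocess.check_call(["hadoop", "fs", "-moveFromLocal", local_copy, hdfsinputdir])
  else:
    subprocess.check_call(["hadoop", "fs", "-put", inputfile, hdfsinputdir])
  return hdfsinputdir + "/" + name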
Example 3
 def RunMR(self, input_files, outputdir, reduce_tasks, reducer, mapper, mroptions):
   """Assemble and run the streaming MapReduce command for the given job."""
   mr_str = self.hadoopmr_
   if mroptions:
     mr_str += mroptions + " "
   mr_str += self.hadooplibpath_ + input_files
   if not reducer:
     # Map-only job: skip the reduce phase entirely.
     mr_str += " -numReduceTasks 0 --reducer None "
   else:
     if int(reduce_tasks) >= 0:
       mr_str += " -numReduceTasks " + str(reduce_tasks)
     mr_str += " --reducer " + reducer
   mr_str += " --output " + outputdir
   mr_str += " --mapper " + mapper
   pyutil.printInfo("Running MR on: " + input_files)
   if pyutil.runCommand(mr_str) != 0:
     pyutil.printError(33, "Error running MR: " + mr_str)
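
The --mapper and --reducer flags here belong to whatever command self.hadoopmr_ expands to, which looks like a site-specific streaming wrapper rather than stock Hadoop Streaming. For comparison, a sketch of an equivalent submission with stock Hadoop Streaming; the jar location is an assumption about the installation:

import subprocess

def run_streaming_job(input_path, output_path, mapper, reducer=None, reduce_tasks=10,
                      jar="/usr/lib/hadoop-mapreduce/hadoop-streaming.jar"):
  cmd = ["hadoop", "jar", jar,
         "-input", input_path,
         "-output", output_path,
         "-mapper", mapper]
  if reducer is None:
    cmd += ["-numReduceTasks", "0"]  # map-only, as in the reducer=None branch above
  else:
    cmd += ["-reducer", reducer, "-numReduceTasks", str(reduce_tasks)]
  if subprocess.call(cmd) != 0:
    raise RuntimeError("Streaming job failed: " + " ".join(cmd))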
Example 4
 def CheckHDFSFile(self, filename):
   """Return True if the file exists on HDFS."""
   hdinput_test = self.hadooptest_ + "-e " + filename
   return pyutil.runCommand(hdinput_test) == 0
Example 5
 def CheckHDir(self, directory):
   """Return True if the directory exists on HDFS."""
   test_str = self.hadooptest_ + "-d " + directory
   return pyutil.runCommand(test_str) == 0
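
Examples 4 and 5 are the same hadoop fs -test wrapper with different flags. A standalone, parameterized sketch, again assuming the CLI is on PATH:

import subprocess

def hdfs_test(path, flag="-e"):
  # -e: path exists, -d: path is a directory, -z: file has zero length.
  # `hadoop fs -test` signals the result through its exit status.
  return subprocess.call(["hadoop", "fs", "-test", flag, path]) == 0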
Example 6
 def CatPipe(self, hdfsfiles, pipecmd):
   """Stream the given HDFS files through a local shell pipeline."""
   catcmd = self.hadoopcat_ + " " + hdfsfiles + " | " + pipecmd
   if pyutil.runCommand(catcmd) != 0:
     pyutil.printError(34, "Error: " + catcmd)
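
Because catcmd contains a |, runCommand presumably executes it through a shell. A sketch of the same streaming pipe without invoking a shell, using two subprocess.Popen processes (the function and argument names are illustrative):

import subprocess

def cat_pipe(hdfs_paths, pipe_cmd):
  # e.g. cat_pipe(["/hdfs/out/part-*"], ["wc", "-l"]); hadoop fs -cat expands the glob.
  cat = subprocess.Popen(["hadoop", "fs", "-cat"] + list(hdfs_paths),
                         stdout=subprocess.PIPE)
  sink = subprocess.Popen(pipe_cmd, stdin=cat.stdout)
  cat.stdout.close()  # so -cat receives SIGPIPE if the sink exits early
  if sink.wait() != 0 or cat.wait() != 0:
    raise RuntimeError("pipeline failed")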