# --- Example 1 ---
 def CheckInputFile(self, inputfile, hdfsinputdir, outputdir, force, uncompress):
   """Ensure an input file is present on HDFS, uploading it if needed.

   Args:
     inputfile: Local path of the input file.
     hdfsinputdir: HDFS directory that should hold the input.
     outputdir: Local scratch directory used when uncompressing.
     force: If true, re-upload even when the file already exists on HDFS.
     uncompress: If true, gunzip ".gz" inputs before uploading.

   Returns:
     A " --input <hdfspath>" argument string for the streaming command.
   """
   unzip = inputfile.endswith(".gz") and uncompress
   input_filename = os.path.basename(inputfile)
   if unzip:
     # Strip only the trailing ".gz"; the previous replace(".gz", "")
     # also mangled ".gz" occurring mid-name (e.g. "a.gz.txt" -> "a.txt").
     input_filename = input_filename[:-len(".gz")]
   pyutil.printDebug(1, "Processing input " + input_filename + "\n")
   # Check that the input data exists on HDFS and move it there if necessary.
   hdfsinputfile = hdfsinputdir + "/" + input_filename
   if not self.CheckHDFSFile(hdfsinputfile) or force:
     pyutil.printInfo("Regenerating HDFS input: " + hdfsinputfile)
     if not self.CheckHDir(hdfsinputdir):
       pyutil.runCommand(self.hadoopmkdir_ + hdfsinputdir)
     if unzip:
       # Gunzip into local scratch space, then move the result onto HDFS.
       new_input = outputdir + "/" + input_filename
       unzipcmd = "gunzip -c " + inputfile + " > " + new_input
       if pyutil.runCommand(unzipcmd) != 0:
         pyutil.printError(12, "Unable to unzip file: " + inputfile)
       pyutil.runCommand(self.hadoopmove_ + new_input + " " + hdfsinputdir)
     else:
       pyutil.runCommand(self.hadoopput_ + inputfile + " " + hdfsinputdir)
     if not self.CheckHDFSFile(hdfsinputfile):
       pyutil.printError(10, "Unable to create input on HDFS: " + hdfsinputfile)
   else:
     pyutil.printDebug(5, "Found file on HDFS: " + hdfsinputfile)
   # Every branch produces the same flag once the file is known to be on
   # HDFS, so build it once here instead of in three places.
   return " --input " + hdfsinputfile
# --- Example 2 ---
 def RunMR(self, input_files, outputdir, reduce_tasks, reducer, mapper, mroptions):
   """Assemble and run a Hadoop streaming MapReduce command.

   Args:
     input_files: Pre-built " --input ..." argument string.
     outputdir: HDFS output directory for the job.
     reduce_tasks: Number of reduce tasks; negative leaves the cluster
       default in place.  Ignored when reducer is falsy.
     reducer: Reducer command, or a falsy value for a map-only job.
     mapper: Mapper command.
     mroptions: Optional extra options inserted after the base invocation.
   """
   mr_str = self.hadoopmr_
   if mroptions:
     mr_str += mroptions + " "
   mr_str += self.hadooplibpath_ + input_files
   if not reducer:
     # Map-only job: disable the reduce phase entirely.
     mr_str += " -numReduceTasks 0 --reducer None "
   else:
     if int(reduce_tasks) >= 0:
       mr_str += " -numReduceTasks " + str(reduce_tasks)
     mr_str += " --reducer " + reducer
   mr_str += " --output " + outputdir
   mr_str += " --mapper " + mapper
   pyutil.printInfo("Running MR on: " + input_files)
   if pyutil.runCommand(mr_str) != 0:
     # ": " separator so the command is readable in the error output
     # (the old message ran "MR" and the command together).
     pyutil.printError(33, "Error running MR: " + mr_str)
# --- Example 3 ---
 def CatPipeRead(self, hdfsfiles, pipecmd, retval):
   """Stream HDFS file contents through a shell pipeline and capture output.

   Builds "hadoop cat <files> | <pipecmd>", reads its output into retval,
   and aborts with error code 30 if the command fails.
   """
   catcmd = "%s %s | %s" % (self.hadoopcat_, hdfsfiles, pipecmd)
   succeeded = pyutil.readCommand(catcmd, retval)
   if not succeeded:
     pyutil.printError(30, "Error running command " + catcmd)
# --- Example 4 ---
 def CatPipe(self, hdfsfiles, pipecmd):
   """Stream HDFS file contents through a shell pipeline.

   Aborts with error code 34 when the piped command exits non-zero.
   """
   catcmd = "%s %s | %s" % (self.hadoopcat_, hdfsfiles, pipecmd)
   if pyutil.runCommand(catcmd):
     pyutil.printError(34, "Error: " + catcmd)
# --- Example 5 ---
    # NOTE(review): this fragment starts mid-`if` — the enclosing condition
    # (presumably a missing --outputdir check) is outside this view.
    # optParse.error prints usage and exits — TODO confirm against optparse.
    optParse.error("--outputdir option is required")

# Propagate the command-line verbosity level to the shared logging helper.
pyutil.DEBUG = options.verbosity

# Attempt to find the hadoop installation.
# Honor --hadooproot when given; otherwise probe the conventional prefixes.
hadooproot = options.hadooproot
if not hadooproot:
    if os.path.isdir("/usr/lib/hadoop"):
        hadooproot = "/usr/lib/hadoop"
    elif os.path.isdir("/usr/local/lib/hadoop"):
        hadooproot = "/usr/local/lib/hadoop"
    elif os.path.isdir("/opt/lib/hadoop"):
        hadooproot = "/opt/lib/hadoop"
    else:
        # presumably printError reports and aborts with code 10 — verify.
        pyutil.printError(
            10, "Unable to find the hadoop installation.  " +
            "Please specify with --hadooproot.")

# Locate the hadoop streaming jar unless --streamingloc was given:
# first next to the install root, then under contrib/streaming.
streamingloc = options.streamingloc
if not streamingloc:
    if os.path.exists(hadooproot + "/hadoop-streaming.jar"):
        streamingloc = hadooproot + "/hadoop-streaming.jar"
    else:
        # The glob below allows a versioned jar name such as
        # hadoop-streaming-<version>.jar under contrib/streaming.
        tmppath = hadooproot + "/contrib/streaming"
        if not os.path.isdir(tmppath):
            pyutil.printError(
                10, hadooproot + "/contrib/streaming does not exist.  " +
                "Please specify location of hadoop streaming jar file with " +
                "--streamingloc")
        streamingjar = glob.glob(tmppath + "/hadoop-streaming*.jar")
        # NOTE(review): fragment is truncated here — the body of this `if`
        # (handling zero or multiple matching jars) is not visible.
        if len(streamingjar) != 1:
# --- Example 6 ---
# The output directory is mandatory; bail out with a usage error otherwise.
if not options.outputdir:
  optParse.error("--outputdir option is required")

# Propagate the command-line verbosity level to the shared logging helper.
pyutil.DEBUG = options.verbosity

# Locate the hadoop installation: honor --hadooproot when given,
# otherwise probe the conventional install prefixes in order.
hadooproot = options.hadooproot
if not hadooproot:
  for candidate in ("/usr/lib/hadoop",
                    "/usr/local/lib/hadoop",
                    "/opt/lib/hadoop"):
    if os.path.isdir(candidate):
      hadooproot = candidate
      break
  else:
    pyutil.printError(10, "Unable to find the hadoop installation.  " +
                      "Please specify with --hadooproot.")

# Locate the hadoop streaming jar: honor --streamingloc when given,
# otherwise look next to the install root, then under contrib/streaming.
streamingloc = options.streamingloc
if not streamingloc:
  bundled_jar = hadooproot + "/hadoop-streaming.jar"
  if os.path.exists(bundled_jar):
    streamingloc = bundled_jar
  else:
    tmppath = hadooproot + "/contrib/streaming"
    if not os.path.isdir(tmppath):
      pyutil.printError(10, hadooproot + "/contrib/streaming does not exist.  " +
                        "Please specify location of hadoop streaming jar file with " +
                        "--streamingloc")
    streamingjar = glob.glob(tmppath + "/hadoop-streaming*.jar")
    if len(streamingjar) != 1:
      pyutil.printError(10, "Unable to find streaming jar, please specify with --streamingloc")
    streamingloc = streamingjar[0]