Example 1
def main(args):
    conf = JobConf(WordCountMap)
    conf.setJobName("wordcount")

    conf.setOutputKeyClass(Text)
    conf.setOutputValueClass(IntWritable)

    conf.setMapperClass(WordCountMap)
    conf.setCombinerClass(Summer)
    conf.setReducerClass(Summer)
    try:
        flags, other_args = getopt.getopt(args[1:], "m:r:")
    except getopt.GetoptError:
        printUsage(1)
    if len(other_args) != 2:
        printUsage(1)

    for f, v in flags:
        if f == "-m":
            conf.setNumMapTasks(int(v))
        elif f == "-r":
            conf.setNumReduceTasks(int(v))
    conf.setInputPath(Path(other_args[0]))
    conf.setOutputPath(Path(other_args[1]))
    JobClient.runJob(conf)
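
The driver above refers to WordCountMap and Summer classes that are not shown in this snippet. A minimal sketch of what such classes could look like against the old org.apache.hadoop.mapred Jython API follows; the class bodies are an assumption based on the standard word-count pattern, not part of the original example.

from org.apache.hadoop.io import IntWritable, Text
from org.apache.hadoop.mapred import MapReduceBase, Mapper, Reducer

class WordCountMap(Mapper, MapReduceBase):
    # Emit (word, 1) for every whitespace-separated token in the input line.
    one = IntWritable(1)
    def map(self, key, value, output, reporter):
        for w in value.toString().split():
            output.collect(Text(w), self.one)

class Summer(Reducer, MapReduceBase):
    # Sum the counts for each word; the same class doubles as the combiner.
    def reduce(self, key, values, output, reporter):
        total = 0
        while values.hasNext():
            total += values.next().get()
        output.collect(key, IntWritable(total))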
Example 2
def main(args):
    if len(args) < 6:
        printUsage(1)

    inDir = args[1]
    outDir = args[2]
    numOfReducers = int(args[3])
    theInputFormat = args[4]
    specFile = args[5]

    print "numOfReducers: ", numOfReducers, "theInputFormat: ", theInputFormat, "specFile: ", specFile

    conf = JobConf(AbacusMapper)
    conf.setJobName("recordcount")
    conf.addDefaultResource(Path(specFile))

    if theInputFormat == "textinputformat":
        conf.setInputFormat(TextInputFormat)
    else:
        conf.setInputFormat(SequenceFileInputFormat)
    conf.setOutputFormat(TextOutputFormat)
    conf.setMapOutputKeyClass(Text)
    conf.setMapOutputValueClass(Text)
    conf.setOutputKeyClass(Text)
    conf.setOutputValueClass(Text)
    conf.setNumMapTasks(1)
    conf.setNumReduceTasks(numOfReducers)

    conf.setMapperClass(AbacusMapper)
    conf.setCombinerClass(AbacusCombiner)
    conf.setReducerClass(AbacusReducer)
    conf.setInputPath(Path(inDir))
    conf.setOutputPath(Path(outDir))

    JobClient.runJob(conf)
Example 3
 def hdfs_folder_exists(self, folder):
     path = Path(folder)
     fs = path.getFileSystem(Configuration())
     try:
         status = fs.getFileStatus(path)
         # TODO: there could be problems if it exists but is a simple file
         return status.isDir()
     except:
         return False
Example 4
 def createInputFile(self, fs, fileName, input_data):
     if (fs.exists(Path(fileName))):
         raise IOException("File " + fileName +
                           " already exists on the minicluster")
     stream = fs.create(Path(fileName))
     pw = PrintWriter(OutputStreamWriter(stream, "UTF-8"))
     for line in input_data:
         pw.println(line)
     pw.close()
Example 5
 def mv(self, srcfpath, trgfpath):
     try:
         sp = Path(srcfpath)
         tp = Path(trgfpath)
         #  Needs work...
         self.fsHd.rename(sp, tp)
     except JException as ex:
         self.logger.error("Exception in HdfsUtil.mv({}): ex[{}]".format(
             fpath, ex))
Example 6
    def _open_write(self, path):
        p = Path(path)

        dirname = p.getParent()
        self._fs.mkdirs(dirname)

        key = self._key
        value = self._value

        writer = SequenceFile.createWriter(self._fs, self._conf, p, key.getClass(), value.getClass())
        return WrappedWriter(writer, key, value)
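
The WrappedWriter class is defined elsewhere in that project. Independent of it, a minimal sketch of writing a SequenceFile directly with the same (deprecated) createWriter signature might look like the following; the output path and the use of Text keys and values are assumptions.

from org.apache.hadoop.conf import Configuration
from org.apache.hadoop.fs import FileSystem, Path
from org.apache.hadoop.io import SequenceFile, Text

conf = Configuration()
fs = FileSystem.get(conf)
# Hypothetical path and key/value types, for illustration only.
writer = SequenceFile.createWriter(fs, conf, Path("/tmp/example.seq"),
                                   Text().getClass(), Text().getClass())
writer.append(Text("key-1"), Text("value-1"))
writer.close()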
Example 7
    def __init__(self, hdfsCluster):
        self.logger = Logger.getLogger("Hdfs")
        # self.logger.setLevel(Level.DEBUG)

        coreSite = "/etc/hadoop/conf/core-site.xml"
        hdfsSite = "/etc/hadoop/conf/hdfs-site.xml"
        self.hdfsCluster = hdfsCluster
        self.cHdfs = Configuration()
        self.cHdfs.addResource(Path(coreSite))
        self.cHdfs.addResource(Path(hdfsSite))
        self.cHdfs.set("fs.defaultFS", hdfsCluster)
        self.fileSystem = FileSystem.get(self.cHdfs)
        self.fileUtil = FileUtil()
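
A rough usage sketch for this helper; the NameNode URI below is an assumption.

# Hypothetical construction of the Hdfs helper above; the cluster URI is made up.
hdfs = Hdfs("hdfs://namenode.example.com:8020")
print "Home directory is " + str(hdfs.fileSystem.getHomeDirectory())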
Example 8
 def lsIterator(self, fpath):
     ''' Returns an iterator that returns files and dirs
         w/in a given dir path (no recursion).
     '''
     p = Path(fpath)
     self.lsListIterator = self.fsHd.listLocatedStatus(p)
     return self.lsListIterator
Example 9
 def lsFileIterator(self, fpath, recurse=False):
     ''' Returns an iterator that returns files (only)
         w/in a given dir path (w/the option for recursion).
     '''
     p = Path(fpath)
     self.fileListIterator = self.fsHd.listFiles(p, recurse)
     return self.fileListIterator
Example 10
 def isDir(self, fpath):
     try:
         # Check whether the given path is a directory on HDFS
         fp = Path(fpath)
         return self.fsHd.isDirectory(fp)
     except JException as ex:
         self.logger.error("Exception in HdfsUtil.isDir({}): ex[{}]".format(
             fpath, ex))
Example 11
 def setOwner(self, fpath, user, group):
     try:
         # Change the owner and group of the given path on HDFS
         fp = Path(fpath)
         return self.fsHd.setOwner(fp, user, group)
     except JException as ex:
         self.logger.error(
             "Exception in HdfsUtil.setOwner({}): ex[{}]".format(fpath, ex))
Example 12
def expand_path_with_home(output_folder):
    """Prepend the home folder to a relative location on HDFS if necessary.
    
    If we specified a relative path, prepend it with the home folder
    of the user on HDFS. If we are running in local mode, don't do anything.
    
    Arguments:
    output_folder -- the absolute or relative path of the output HDFS folder
    """
    import pycascading.pipe

    if pycascading.pipe.running_mode == "hadoop":
        if output_folder == "" or (output_folder[0:5] not in set(["hdfs:", "file:"]) and output_folder[0] != "/"):
            fs = Path("/").getFileSystem(Configuration())
            home_folder = fs.getHomeDirectory().toString()
            return home_folder + "/" + output_folder
    return output_folder
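
Roughly, the effect is the following; the user name and NameNode authority below are made up for illustration, and the snippet assumes running_mode is 'hadoop'.

# Illustrative behaviour only; the home-directory URI is an assumption.
print expand_path_with_home('logs/output')        # -> hdfs://nn:8020/user/alice/logs/output
print expand_path_with_home('/tmp/output')        # -> /tmp/output
print expand_path_with_home('hdfs://nn:8020/in')  # -> hdfs://nn:8020/in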
Example 13
def expand_path_with_home(output_folder):
    """Prepend the home folder to a relative location on HDFS if necessary.

    Only if we specified a relative path and no scheme, prepend it with the
    home folder of the user on HDFS. This behavior is similar to how
    "hadoop fs" works. If we are running in local mode, don't do anything.

    Arguments:
    output_folder -- the absolute or relative path of the output HDFS folder
    """
    import pycascading.pipe
    if pycascading.pipe.config['pycascading.running_mode'] == 'hadoop':
        if not any(map(lambda scheme: output_folder.startswith(scheme), \
                       ['hdfs:', 'file:', 's3:', 's3n:', '/'])):
            fs = Path('/').getFileSystem(Configuration())
            home_folder = fs.getHomeDirectory().toString()
            return home_folder + '/' + output_folder
    return output_folder
Example 14
 def touch(self, fpath):
     try:
         # Create an empty file on HDFS
         fp = Path(fpath)
         out = self.fsHd.create(fp)
         out.close()
     except JException as ex:
         self.logger.error("Exception in HdfsUtil.touch({}): ex[{}]".format(
             fpath, ex))
Example 15
 def getFileStat(self, fpath):
     try:
         # Fetch the FileStatus for the given path on HDFS
         fp = Path(fpath)
         return self.fsHd.getFileStatus(fp)
     except JException as ex:
         self.logger.error(
             "Exception in HdfsUtil.getFileStat({}): ex[{}]".format(
                 fpath, ex))
Example 16
 def setPerms(self, fpath, perms):
     try:
         # Set the permissions of the given path on HDFS
         fp = Path(fpath)
         fsPerm = FsPermission(perms)
         return self.fsHd.setPermission(fp, fsPerm)
     except JException as ex:
         self.logger.error(
             "Exception in HdfsUtil.setPerms({}): ex[{}]".format(fpath, ex))
Example 17
 def ls(self, fpath):
     ''' Returns a list of all files in a given dir.
         This file list can be very long and take lots of memory.
         Use lsIterator or lsFileIterator instead to minimize
         memory usage.
     '''
     p = Path(fpath)
     self.fileList = self.fsHd.listStatus(p)
     return self.fileList
Example 18
def expand_path_with_home(output_folder):
    """Prepend the home folder to a relative location on HDFS if necessary.
    
    If we specified a relative path, prepend it with the home folder
    of the user on HDFS. If we are running in local mode, don't do anything.
    
    Arguments:
    output_folder -- the absolute or relative path of the output HDFS folder
    """
    import pycascading.pipe
    if pycascading.pipe.running_mode == 'hadoop':
        if output_folder == '' or \
        (output_folder[0 : 5] not in set(['hdfs:', 'file:']) and \
         output_folder[0] != '/'):
            fs = Path('/').getFileSystem(Configuration())
            home_folder = fs.getHomeDirectory().toString()
            return home_folder + '/' + output_folder
    return output_folder
Example 19
 def setRep(self, fpath, replication):
     try:
         # Set the replication factor of the given path on HDFS
         fp = Path(fpath)
         return self.fsHd.setReplication(fp, replication)
     except JException as ex:
         self.logger.error(
             "Exception in HdfsUtil.setReplication({}): ex[{}]".format(
                 fpath, ex))
Example 20
    def mkdir(self, fpath, perms=755):
        try:
            p = Path(fpath)
            permObj = FsPermission("{}".format(perms))
            retVal = self.fsHd.mkdirs(p, permObj)
            if not retVal:
                self.logger.error(
                    "HdfsUtil.mkdir({}): Failed to create dir.".format(
                        fpath))

        except JException as ex:
            self.logger.error("Exception in HdfsUtil.mkdir({}): ex[{}]".format(
                fpath, ex))
Example 21
    def cp(self, srcfpath, trgfpath):
        "Copy data within the current HDFS."
        try:
            sp = []
            fileList = self.fsHd.globStatus(Path(srcfpath))
            if fileList is None or len(fileList) == 0:
                # Emit an error: no files found for srcfpath
                self.logger.error(
                    "HdfsUtil.cp({}): no files found for source path.".format(
                        srcfpath))
                return
            for sfp in fileList:
                sp.append(sfp.getPath())

            sfs = FileSystem.newInstance(self.hdfs.cHdfs)
            tp = Path(trgfpath)
            tfs = FileSystem.newInstance(self.hdfs.cHdfs)
            delSrc = False
            overWrite = True
            self.hdfs.fileUtil.copy(sfs, sp, tfs, tp, delSrc, overWrite,
                                    self.hdfs.cHdfs)
        except JException as ex:
            self.logger.error(
                "Exception in HdfsUtil.cp({} -> {}): ex[{}]".format(
                    srcfpath, trgfpath, ex))
Example 22
    def exists(self, fpath):
        try:
            # Check whether any file matches the given path (globs are allowed)
            fileList = self.fsHd.globStatus(Path(fpath))
            if fileList is None or len(fileList) == 0:
                # No files matched fpath
                return False

            return True

        except JException as ex:
            self.logger.error(
                "Exception in HdfsUtil.exists({}): ex[{}]".format(fpath, ex))
Example 23
    def openRead(self, fPath):
        fpHdfs = Path(fPath)
        fsInput = self.fsHd.open(fpHdfs)

        reader = None
        pat = r'.*\.gz'
        match = re.search(pat, fPath)
        if match is None:
            reader = BufferedReader(InputStreamReader(fsInput))
        else:
            # The file stream is in GZip format...
            reader = BufferedReader(InputStreamReader(
                GZIPInputStream(fsInput)))

        return reader
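
A hedged usage sketch of openRead; the instance name and the file path are assumptions, not part of the original snippet.

# Hypothetical usage: read a (possibly gzipped) HDFS file line by line.
reader = hdfs_util.openRead("/data/logs/events.log.gz")
line = reader.readLine()
while line is not None:
    print line
    line = reader.readLine()
reader.close()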
Example 24
    def testStore(self):
        from tempfile import mktemp
        tempdir = mktemp()
        outfile = tempdir + '/top_3_queries'
        args = [
            "n=3",
            "reducers=1",
            "input=" + self.INPUT_FILE,
            "output=" + outfile,
        ]
        proxy = PigProxy.from_file(self.PIG_SCRIPT, args)

        # By default all STORE and DUMP commands are removed
        proxy.unoverride("STORE")
        proxy.run_script()
        cluster = Cluster(proxy.pig.getPigContext())
        self.assert_(cluster.delete(Path(outfile)))
Example 25
    def __init__(self, hdfsCluster, fpath):
        self.hdfs = Hdfs(hdfsCluster)
        self.fsHd = self.hdfs.fileSystem

        fpHdfs = Path(fpath)
        fsInput = self.fsHd.open(fpHdfs)
        # The file contains text, so read the input stream through a BufferedReader.
        reader = BufferedReader(InputStreamReader(fsInput))
        self.lineCount = 0
        self.lines = []
        line = reader.readLine()
        while line is not None:
            # print line
            self.lines.append(line)
            self.lineCount = self.lineCount + 1
            if ((self.lineCount % 1000) == 0):
                print self.lineCount
            line = reader.readLine()
Example 26
    def rm(self, fpath, **kwargs):
        try:
            fileList = self.fsHd.globStatus(Path(fpath))
            if (fileList is None):
                # self.logger.warn("No Files found in: [{}]".format(fpath))
                return

            if 'recurse' in kwargs:
                recurse = kwargs['recurse']
            else:
                recurse = False

            for sfp in fileList:
                self.fsHd.delete(sfp.getPath(), recurse)

        except JException as ex:
            self.logger.error("Exception in HdfsUtil.rm({}): ex[{}]".format(
                fpath, ex))
Example 27
#  Licensed to the Apache Software Foundation (ASF) under one or more
#  contributor license agreements.  See the NOTICE file distributed with
#  this work for additional information regarding copyright ownership.
#  The ASF licenses this file to You under the Apache License, Version 2.0
#  (the "License"); you may not use this file except in compliance with
#  the License.  You may obtain a copy of the License at
#
#      http://www.apache.org/licenses/LICENSE-2.0
#
#  Unless required by applicable law or agreed to in writing, software
#  distributed under the License is distributed on an "AS IS" BASIS,
#  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
#  See the License for the specific language governing permissions and
#  limitations under the License.
from org.apache.hadoop.fs import Path  # Test for PIG-1824
p = Path('foo')


@outputSchemaFunction("squareSchema")
def square(num):
    if num is None:
        return None
    return num * num


@schemaFunction("squareSchema")
def squareSchema(input):
    return input


@outputSchema("word:chararray")
Example 28
 def copyContentFromLocalFile(self, content, dest_path, overwrite=True):
     file_path = Path(dest_path)
     fs = file_path.getFileSystem(self.configuration)
     if overwrite and fs.exists(file_path):
         fs.delete(file_path, True)
     self.createInputFile(fs, dest_path, content)
Example 29
from java.util import UUID
from org.apache.hadoop.fs import Path

print "Home dir is " + str(fs.homeDirectory)
print "Work dir is " + str(fs.workingDirectory)
print "/user exists " + str(fs.exists("/user"))

name = UUID.randomUUID().toString()
scriptName = "src/test/resources/test.properties"
fs.copyFromLocalFile(scriptName, name)
print Path(name).makeQualified(fs)

# use the shell
dir = "script-dir/"
if not fsh.test(dir):
	fsh.mkdir(dir)
	fsh.cp(name, dir)
	fsh.chmodr(700, dir)
	print "File content is " + str(fsh.cat(dir + name))


print str(fsh.ls(dir))
fsh.rmr(dir)
fs.getLength(name)
Example 30
import sys
import time

from java.lang import System
from org.apache.hadoop.conf import Configuration
from org.apache.hadoop.fs import FileSystem, Path

startTime = time.time()

if len(sys.argv) != 2:
    raise Exception(sys.argv[0] + ' <basedir>')

# Get reference to the Hadoop FileSystem object.  Everything we do in
# this script that interacts with HDFS is through this object.
fs = FileSystem.get(Configuration())

# Make sure the requested collection exists.
collection = sys.argv[1]
collectionDir = Path(collection)

if not fs.exists(collectionDir):
    print '\nERROR: no collection directory: %s' % collectionDir
    System.exit(1)

# Check for "guard" file.  Like a semaphore, ensures that we don't try
# to update this collection while it's in the middle of being updated.
# Since file creation in HDFS is atomic, we don't check for the existence
# of the guardFile, rather we try to create it.  If the file already exists
# then fs.createNewFile() will return False
guardFile = Path(collectionDir, '_updating')
if not fs.createNewFile(guardFile):
    print '\nERROR: collection update already in progress: %s' % guardFile
    System.exit(1)
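
The excerpt ends here; presumably the script removes the guard file again once the update finishes. A one-line sketch of that cleanup (not shown in the original) would be:

# Hypothetical cleanup once the collection update completes.
# The False flag requests a non-recursive delete, which is enough for a single file.
fs.delete(guardFile, False)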
Example 31
    def openWrite(self, fPath):
        fp = Path(fPath)
        fsOutput = self.fsHd.create(fp)

        return fsOutput