Code example #1
import getopt

# Hadoop classes from the old "mapred" API; WordCountMap, Summer and
# printUsage are defined elsewhere in the surrounding project.
from org.apache.hadoop.fs import Path
from org.apache.hadoop.io import IntWritable, Text
from org.apache.hadoop.mapred import JobClient, JobConf


def main(args):
    conf = JobConf(WordCountMap)
    conf.setJobName("wordcount")

    conf.setOutputKeyClass(Text)
    conf.setOutputValueClass(IntWritable)

    conf.setMapperClass(WordCountMap)
    conf.setCombinerClass(Summer)
    conf.setReducerClass(Summer)
    try:
        flags, other_args = getopt.getopt(args[1:], "m:r:")
    except getopt.GetoptError:
        printUsage(1)
    if len(other_args) != 2:
        printUsage(1)

    for f, v in flags:
        if f == "-m":
            conf.setNumMapTasks(int(v))
        elif f == "-r":
            conf.setNumReduceTasks(int(v))
    conf.setInputPath(Path(other_args[0]))
    conf.setOutputPath(Path(other_args[1]))
    JobClient.runJob(conf)
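A hedged invocation sketch for the driver above: -m and -r set the map and reduce task counts, and exactly two positional arguments (input and output paths, hypothetical here) must remain.

# Run the word count with 10 map and 2 reduce tasks (paths are hypothetical):
main(["wordcount", "-m", "10", "-r", "2", "books_in", "counts_out"])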
Code example #2
# Hadoop classes from the old "mapred" API; AbacusMapper, AbacusCombiner,
# AbacusReducer and printUsage are defined elsewhere in the surrounding
# project.
from org.apache.hadoop.fs import Path
from org.apache.hadoop.io import Text
from org.apache.hadoop.mapred import JobClient, JobConf, TextInputFormat, \
    TextOutputFormat, SequenceFileInputFormat


def main(args):
    if len(args) < 6:
        printUsage(1)

    inDir = args[1]
    outDir = args[2]
    numOfReducers = int(args[3])
    theInputFormat = args[4]
    specFile = args[5]

    print "numOfReducers: ", numOfReducers, "theInputFormat: ", theInputFormat, "specFile: ", specFile

    conf = JobConf(AbacusMapper)
    conf.setJobName("recordcount")
    conf.addDefaultResource(Path(specFile))

    if theInputFormat == "textinputformat":
        conf.setInputFormat(TextInputFormat)
    else:
        conf.setInputFormat(SequenceFileInputFormat)
    conf.setOutputFormat(TextOutputFormat)
    conf.setMapOutputKeyClass(Text)
    conf.setMapOutputValueClass(Text)
    conf.setOutputKeyClass(Text)
    conf.setOutputValueClass(Text)
    conf.setNumMapTasks(1)
    conf.setNumReduceTasks(numOfReducers)

    conf.setMapperClass(AbacusMapper)
    conf.setCombinerClass(AbacusCombiner)
    conf.setReducerClass(AbacusReducer)
    conf.setInputPath(Path(inDir))
    conf.setOutputPath(Path(outDir))

    JobClient.runJob(conf)
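A similar hedged invocation sketch for this driver; all six positional arguments are required, and the paths and spec file are hypothetical:

main(["recordcount", "in_dir", "out_dir", "2", "textinputformat", "spec.xml"])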
Code example #3
File: tap.py Project: butzeb/pycascading
 def hdfs_folder_exists(self, folder):
     path = Path(folder)
     fs = path.getFileSystem(Configuration())
     try:
         status = fs.getFileStatus(path)
         # TODO: there could be problems if it exists but is a simple file
         return status.isDir()
     except:
         return False
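One way to resolve the TODO above, as a sketch (not from either project): FileSystem.isDirectory returns False both for missing paths and for plain files, so the status lookup and the bare except can be dropped.

 def hdfs_folder_exists(self, folder):
     path = Path(folder)
     fs = path.getFileSystem(Configuration())
     # False for nonexistent paths and for regular files alike
     return fs.isDirectory(path)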
Code example #4
 def createInputFile(self, fs, fileName, input_data):
     if fs.exists(Path(fileName)):
         raise IOException("File " + fileName +
                           " already exists on the minicluster")
     stream = fs.create(Path(fileName))
     pw = PrintWriter(OutputStreamWriter(stream, "UTF-8"))
     for line in input_data:
         pw.println(line)
     pw.close()
Code example #5
 def mv(self, srcfpath, trgfpath):
     try:
         sp = Path(srcfpath)
         tp = Path(trgfpath)
         #  Needs work...
         self.fsHd.rename(sp, tp)
     except JException as ex:
         self.logger.error("Exception in HdfsUtil.mv({}): ex[{}]".format(
             fpath, ex))
Code example #6
File: hdfs.py Project: crankycoder/sandbox
    def _open_write(self, path):
        p = Path(path)

        dirname = p.getParent()
        self._fs.mkdirs(dirname)

        key = self._key
        value = self._value

        writer = SequenceFile.createWriter(self._fs, self._conf, p, key.getClass(), value.getClass())
        return WrappedWriter(writer, key, value)
Code example #7
    def __init__(self, hdfsCluster):
        self.logger = Logger.getLogger("Hdfs")
        # self.logger.setLevel(Level.DEBUG)

        coreSite = "/etc/hadoop/conf/core-site.xml"
        hdfsSite = "/etc/hadoop/conf/hdfs-site.xml"
        self.hdfsCluster = hdfsCluster
        self.cHdfs = Configuration()
        self.cHdfs.addResource(Path(coreSite))
        self.cHdfs.addResource(Path(hdfsSite))
        self.cHdfs.set("fs.defaultFS", hdfsCluster)
        self.fileSystem = FileSystem.get(self.cHdfs)
        self.fileUtil = FileUtil()
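A minimal usage sketch for this wrapper, assuming a hypothetical NameNode URI:

hdfs = Hdfs("hdfs://namenode.example.com:8020")
print hdfs.fileSystem.getHomeDirectory()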
Code example #8
 def lsIterator(self, fpath):
     ''' Returns an iterator that returns files and dirs
         w/in a given dir path (no recursion).
     '''
     p = Path(fpath)
     self.lsListIterator = self.fsHd.listLocatedStatus(p)
     return self.lsListIterator
Code example #9
 def lsFileIterator(self, fpath, recurse=False):
     ''' Returns an iterator that returns files (only)
         w/in a given dir path (w/the option for recursion).
     '''
     p = Path(fpath)
     self.fileListIterator = self.fsHd.listFiles(p, recurse)
     return self.fileListIterator
Code example #10
 def isDir(self, fpath):
     try:
         # Check whether the path is a directory
         fp = Path(fpath)
         return self.fsHd.isDirectory(fp)
     except JException as ex:
         self.logger.error("Exception in HdfsUtil.isDir({}): ex[{}]".format(
             fpath, ex))
Code example #11
 def setOwner(self, fpath, user, group):
     try:
         # Set the owner and group on the path
         fp = Path(fpath)
         return self.fsHd.setOwner(fp, user, group)
     except JException as ex:
         self.logger.error(
             "Exception in HdfsUtil.setOwner({}): ex[{}]".format(fpath, ex))
Code example #12
File: tap.py Project: seanjensengrey/pycascading
def expand_path_with_home(output_folder):
    """Prepend the home folder to a relative location on HDFS if necessary.
    
    If we specified a relative path, prepend it with the home folder
    of the user on HDFS. If we are running in local mode, don't do anything.
    
    Arguments:
    output_folder -- the absolute or relative path of the output HDFS folder
    """
    import pycascading.pipe

    if pycascading.pipe.running_mode == "hadoop":
        if output_folder == "" or (output_folder[0:5] not in set(["hdfs:", "file:"]) and output_folder[0] != "/"):
            fs = Path("/").getFileSystem(Configuration())
            home_folder = fs.getHomeDirectory().toString()
            return home_folder + "/" + output_folder
    return output_folder
Code example #13
def expand_path_with_home(output_folder):
    """Prepend the home folder to a relative location on HDFS if necessary.

    Only if we specified a relative path and no scheme, prepend it with the
    home folder of the user on HDFS. This behavior is similar to how
    "hadoop fs" works. If we are running in local mode, don't do anything.

    Arguments:
    output_folder -- the absolute or relative path of the output HDFS folder
    """
    import pycascading.pipe
    if pycascading.pipe.config['pycascading.running_mode'] == 'hadoop':
        if not any(output_folder.startswith(scheme)
                   for scheme in ['hdfs:', 'file:', 's3:', 's3n:', '/']):
            fs = Path('/').getFileSystem(Configuration())
            home_folder = fs.getHomeDirectory().toString()
            return home_folder + '/' + output_folder
    return output_folder
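Assuming a user whose HDFS home directory is /user/alice (hypothetical), the function resolves paths the way "hadoop fs" does:

# expand_path_with_home('logs/day1')       -> '/user/alice/logs/day1'
# expand_path_with_home('/data/logs')      -> '/data/logs'   (absolute: unchanged)
# expand_path_with_home('s3n://bucket/x')  -> unchanged      (scheme present)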
Code example #14
 def touch(self, fpath):
     try:
         # Create an empty file on HDFS
         fp = Path(fpath)
         stream = self.fsHd.create(fp)
         stream.close()
     except JException as ex:
         self.logger.error("Exception in HdfsUtil.touch({}): ex[{}]".format(
             fpath, ex))
Code example #15
 def getFileStat(self, fpath):
     try:
         # Look up and return the FileStatus for the path
         fp = Path(fpath)
         return self.fsHd.getFileStatus(fp)
     except JException as ex:
         self.logger.error(
             "Exception in HdfsUtil.getFileStat({}): ex[{}]".format(
                 fpath, ex))
Code example #16
 def setPerms(self, fpath, perms):
     try:
         # Apply the given permissions to the path
         fp = Path(fpath)
         fsPerm = FsPermission(perms)
         return self.fsHd.setPermission(fp, fsPerm)
     except JException as ex:
         self.logger.error(
             "Exception in HdfsUtil.setPerms({}): ex[{}]".format(fpath, ex))
Code example #17
 def ls(self, fpath):
     ''' Returns a list of all files in a given dir.
         This file list can be very long and take lots of memory.
         Use lsIterator or lsFileIterator instead to minimize
         memory usage.
     '''
     p = Path(fpath)
     self.fileList = self.fsHd.listStatus(p)
     return self.fileList
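listStatus returns an array of FileStatus objects, so the result can be walked directly; hdfsUtil below is a hypothetical instance of this class:

for status in hdfsUtil.ls("/tmp"):
    print status.getPath(), status.getLen()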
Code example #18
 def setRep(self, fpath, replication):
     try:
         # Set the replication factor for the path
         fp = Path(fpath)
         return self.fsHd.setReplication(fp, replication)
     except JException as ex:
         self.logger.error(
             "Exception in HdfsUtil.setReplication({}): ex[{}]".format(
                 fpath, ex))
Code example #19
    def mkdir(self, fpath, perms=755):
        try:
            p = Path(fpath)
            permObj = FsPermission("{}".format(perms))
            retVal = self.fsHd.mkdirs(p, permObj)
            if not retVal:
                self.logger.error(
                    "HdfsUtil.mkdir({}): Failed to create dir.".format(fpath))

        except JException as ex:
            self.logger.error("Exception in HdfsUtil.mkdir({}): ex[{}]".format(
                fpath, ex))
Code example #20
    def cp(self, srcfpath, trgfpath):
        "Copy data within the current HDFS."
        try:
            sp = []
            fileList = self.fsHd.globStatus(Path(srcfpath))
            if fileList is None or len(fileList) == 0:
                self.logger.error(
                    "HdfsUtil.cp({}): no files matched.".format(srcfpath))
                return
            for sfp in fileList:
                sp.append(sfp.getPath())

            sfs = FileSystem.newInstance(self.hdfs.cHdfs)
            tp = Path(trgfpath)
            tfs = FileSystem.newInstance(self.hdfs.cHdfs)
            delSrc = False
            overWrite = True
            self.hdfs.fileUtil.copy(sfs, sp, tfs, tp, delSrc, overWrite,
                                    self.hdfs.cHdfs)
        except JException as ex:
            self.logger.error(
                "Exception in HdfsUtil.cp({} -> {}): ex[{}]".format(
                    srcfpath, trgfpath, ex))
Code example #21
    def exists(self, fpath):
        try:
            # Glob the path; zero matches means it does not exist
            fileList = self.fsHd.globStatus(Path(fpath))
            if fileList is None or len(fileList) == 0:
                return False

            return True

        except JException as ex:
            self.logger.error(
                "Exception in HdfsUtil.exists({}): ex[{}]".format(fpath, ex))
Code example #22
    def openRead(self, fPath):
        fpHdfs = Path(fPath)
        fsInput = self.fsHd.open(fpHdfs)

        reader = None
        pat = r'.*\.gz'
        match = re.search(pat, fPath)
        if match is None:
            reader = BufferedReader(InputStreamReader(fsInput))
        else:
            # The file stream is in GZip format...
            reader = BufferedReader(InputStreamReader(
                GZIPInputStream(fsInput)))

        return reader
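A usage sketch: the returned java.io.BufferedReader yields None from readLine at end of stream; hdfsUtil and the path are hypothetical.

reader = hdfsUtil.openRead("/data/sample.txt.gz")
line = reader.readLine()
while line is not None:
    print line
    line = reader.readLine()
reader.close()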
Code example #23
File: test_pigproxy.py Project: cbaenziger/squealer
    def testStore(self):
        from tempfile import mktemp
        tempdir = mktemp()
        outfile = tempdir + '/top_3_queries'
        args = [
            "n=3",
            "reducers=1",
            "input=" + self.INPUT_FILE,
            "output=" + outfile,
        ]
        proxy = PigProxy.from_file(self.PIG_SCRIPT, args)

        # By default all STORE and DUMP commands are removed
        proxy.unoverride("STORE")
        proxy.run_script()
        cluster = Cluster(proxy.pig.getPigContext())
        self.assert_(cluster.delete(Path(outfile)))
Code example #24
    def __init__(self, hdfsCluster, fpath):
        self.hdfs = Hdfs(hdfsCluster)
        self.fsHd = self.hdfs.fileSystem

        fpHdfs = Path(fpath)
        fsInput = self.fsHd.open(fpHdfs)
        # The file has text so we want to use read the input stream via the BufferedReader.
        reader = BufferedReader(InputStreamReader(fsInput))
        self.lineCount = 0
        self.lines = []
        line = reader.readLine()
        while line is not None:
            self.lines.append(line)
            self.lineCount += 1
            if self.lineCount % 1000 == 0:
                print self.lineCount
            line = reader.readLine()
        reader.close()
Code example #25
    def rm(self, fpath, **kwargs):
        try:
            fileList = self.fsHd.globStatus(Path(fpath))
            if fileList is None:
                # self.logger.warn("No Files found in: [{}]".format(fpath))
                return

            recurse = kwargs.get('recurse', False)

            for sfp in fileList:
                self.fsHd.delete(sfp.getPath(), recurse)

        except JException as ex:
            self.logger.error("Exception in HdfsUtil.rm({}): ex[{}]".format(
                fpath, ex))
Code example #26
#  Licensed to the Apache Software Foundation (ASF) under one or more
#  contributor license agreements.  See the NOTICE file distributed with
#  this work for additional information regarding copyright ownership.
#  The ASF licenses this file to You under the Apache License, Version 2.0
#  (the "License"); you may not use this file except in compliance with
#  the License.  You may obtain a copy of the License at
#
#      http://www.apache.org/licenses/LICENSE-2.0
#
#  Unless required by applicable law or agreed to in writing, software
#  distributed under the License is distributed on an "AS IS" BASIS,
#  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
#  See the License for the specific language governing permissions and
#  limitations under the License.
from org.apache.hadoop.fs import Path  # Test for PIG-1824
p = Path('foo')


@outputSchemaFunction("squareSchema")
def square(num):
    if num is None:
        return None
    return num * num


@schemaFunction("squareSchema")
def squareSchema(input):
    return input


@outputSchema("word:chararray")
Code example #27
File: cluster.py Project: MarkRoddy/squealer
 def copyContentFromLocalFile(self, content, dest_path, overwrite=True):
     file_path = Path(dest_path)
     fs = file_path.getFileSystem(self.configuration)
     if overwrite and fs.exists(file_path):
         fs.delete(file_path, True)
     self.createInputFile(fs, dest_path, content)
Code example #28
from java.util import UUID
from org.apache.hadoop.fs import Path

# 'fs' (a FileSystem helper) and 'fsh' (an FsShell) are not defined in this
# snippet; they appear to be bound by the hosting script environment
# (e.g. Spring for Apache Hadoop's scripting support).

print "Home dir is " + str(fs.homeDirectory)
print "Work dir is " + str(fs.workingDirectory)
print "/user exists " + str(fs.exists("/user"))

name = UUID.randomUUID().toString()
scriptName = "src/test/resources/test.properties"
fs.copyFromLocalFile(scriptName, name)
print Path(name).makeQualified(fs)

# use the shell
dir = "script-dir/"
if not fsh.test(dir):
	fsh.mkdir(dir)
	fsh.cp(name, dir)
	fsh.chmodr(700, dir)
	print "File content is " + str(fsh.cat(dir + name))


print str(fsh.ls(dir))
fsh.rmr(dir)
fs.getLength(name)
Code example #29
File: build.py Project: internetarchive/waimea
import sys
import time

# Java/Hadoop classes used below; presumably imported by the original
# build.py or provided by its launcher.
from java.lang import System
from org.apache.hadoop.conf import Configuration
from org.apache.hadoop.fs import FileSystem, Path

startTime = time.time()

if len(sys.argv) != 2:
    raise Exception(sys.argv[0] + ' <basedir>')

# Get reference to the Hadoop FileSystem object.  Everything we do in
# this script that interacts with HDFS is through this object.
fs = FileSystem.get(Configuration())

# Make sure the requested collection exists.
collection = sys.argv[1]
collectionDir = Path(collection)

if not fs.exists(collectionDir):
    print '\nERROR: no collection directory: %s' % collectionDir
    System.exit(1)

# Check for "guard" file.  Like a semaphore, ensures that we don't try
# to update this collection while it's in the middle of being updated.
# Since file creation in HDFS is atomic, we don't check for the existence
# of the guardFile, rather we try to create it.  If the file already exists
# then fs.createNewFile() will return False
guardFile = Path(collectionDir, '_updating')
if not fs.createNewFile(guardFile):
    print '\nERROR: collection update already in progress: %s' % guardFile
    System.exit(1)
Code example #30
    def openWrite(self, fPath):
        fp = Path(fPath)
        fsOutput = self.fsHd.create(fp)

        return fsOutput
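openWrite hands back the raw output stream from FileSystem.create, so the caller wraps and closes it; a sketch mirroring example #4's writer setup (hdfsUtil and the path are hypothetical):

from java.io import OutputStreamWriter, PrintWriter

stream = hdfsUtil.openWrite("/tmp/out.txt")
pw = PrintWriter(OutputStreamWriter(stream, "UTF-8"))
pw.println("hello")
pw.close()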