Example 1
def main(args):
    conf = JobConf(WordCountMap)
    conf.setJobName("wordcount")

    conf.setOutputKeyClass(Text)
    conf.setOutputValueClass(IntWritable)

    conf.setMapperClass(WordCountMap)
    conf.setCombinerClass(Summer)
    conf.setReducerClass(Summer)
    try:
        flags, other_args = getopt.getopt(args[1:], "m:r:")
    except getopt.GetoptError:
        printUsage(1)
    if len(other_args) != 2:
        printUsage(1)

    for f, v in flags:
        if f == "-m":
            conf.setNumMapTasks(int(v))
        elif f == "-r":
            conf.setNumReduceTasks(int(v))
    conf.setInputPath(Path(other_args[0]))
    conf.setOutputPath(Path(other_args[1]))
    JobClient.runJob(conf)
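
The driver above refers to WordCountMap and Summer classes that are not shown in this snippet. A minimal sketch of what such classes could look like against the old org.apache.hadoop.mapred Jython API follows; the class bodies are an assumption based on the standard word-count pattern, not part of the original example.

from org.apache.hadoop.io import IntWritable, Text
from org.apache.hadoop.mapred import MapReduceBase, Mapper, Reducer

class WordCountMap(Mapper, MapReduceBase):
    # Emit (word, 1) for every whitespace-separated token in the input line.
    one = IntWritable(1)
    def map(self, key, value, output, reporter):
        for w in value.toString().split():
            output.collect(Text(w), self.one)

class Summer(Reducer, MapReduceBase):
    # Sum the counts for each word; the same class doubles as the combiner.
    def reduce(self, key, values, output, reporter):
        total = 0
        while values.hasNext():
            total += values.next().get()
        output.collect(key, IntWritable(total))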
Example 2
def main(args):
    if len(args) < 6:
        printUsage(1)

    inDir = args[1]
    outDir = args[2]
    numOfReducers = int(args[3])
    theInputFormat = args[4]
    specFile = args[5]

    print "numOfReducers: ", numOfReducers, "theInputFormat: ", theInputFormat, "specFile: ", specFile

    conf = JobConf(AbacusMapper)
    conf.setJobName("recordcount")
    conf.addDefaultResource(Path(specFile))

    if theInputFormat == "textinputformat":
        conf.setInputFormat(TextInputFormat)
    else:
        conf.setInputFormat(SequenceFileInputFormat)
    conf.setOutputFormat(TextOutputFormat)
    conf.setMapOutputKeyClass(Text)
    conf.setMapOutputValueClass(Text)
    conf.setOutputKeyClass(Text)
    conf.setOutputValueClass(Text)
    conf.setNumMapTasks(1)
    conf.setNumReduceTasks(numOfReducers)

    conf.setMapperClass(AbacusMapper)
    conf.setCombinerClass(AbacusCombiner)
    conf.setReducerClass(AbacusReducer)
    conf.setInputPath(Path(inDir))
    conf.setOutputPath(Path(outDir))

    JobClient.runJob(conf)
Example 3
 def hdfs_folder_exists(self, folder):
     path = Path(folder)
     fs = path.getFileSystem(Configuration())
     try:
         status = fs.getFileStatus(path)
         # TODO: there could be problems if it exists but is a simple file
         return status.isDir()
     except:
         return False
Example 4
 def createInputFile(self, fs, fileName, input_data):
     if (fs.exists(Path(fileName))):
         raise IOException("File " + fileName +
                           " already exists on the minicluster")
     stream = fs.create(Path(fileName))
     pw = PrintWriter(OutputStreamWriter(stream, "UTF-8"))
     for line in input_data:
         pw.println(line)
     pw.close()
Example 5
 def mv(self, srcfpath, trgfpath):
     try:
         sp = Path(srcfpath)
         tp = Path(trgfpath)
         #  Needs work...
         self.fsHd.rename(sp, tp)
     except JException as ex:
         self.logger.error("Exception in HdfsUtil.mv({}): ex[{}]".format(
             fpath, ex))
Example 6
    def _open_write(self, path):
        p = Path(path)

        dirname = p.getParent()
        self._fs.mkdirs(dirname)

        key = self._key
        value = self._value

        writer = SequenceFile.createWriter(self._fs, self._conf, p, key.getClass(), value.getClass())
        return WrappedWriter(writer, key, value)
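
The WrappedWriter class is defined elsewhere in that project. Independent of it, a minimal sketch of writing a SequenceFile directly with the same (deprecated) createWriter signature might look like the following; the output path and the use of Text keys and values are assumptions.

from org.apache.hadoop.conf import Configuration
from org.apache.hadoop.fs import FileSystem, Path
from org.apache.hadoop.io import SequenceFile, Text

conf = Configuration()
fs = FileSystem.get(conf)
# Hypothetical path and key/value types, for illustration only.
writer = SequenceFile.createWriter(fs, conf, Path("/tmp/example.seq"),
                                   Text().getClass(), Text().getClass())
writer.append(Text("key-1"), Text("value-1"))
writer.close()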
Example 7
    def __init__(self, hdfsCluster):
        self.logger = Logger.getLogger("Hdfs")
        # self.logger.setLevel(Level.DEBUG)

        coreSite = "/etc/hadoop/conf/core-site.xml"
        hdfsSite = "/etc/hadoop/conf/hdfs-site.xml"
        self.hdfsCluster = hdfsCluster
        self.cHdfs = Configuration()
        self.cHdfs.addResource(Path(coreSite))
        self.cHdfs.addResource(Path(hdfsSite))
        self.cHdfs.set("fs.defaultFS", hdfsCluster)
        self.fileSystem = FileSystem.get(self.cHdfs)
        self.fileUtil = FileUtil()
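
A rough usage sketch for this helper; the NameNode URI below is an assumption.

# Hypothetical construction of the Hdfs helper above; the cluster URI is made up.
hdfs = Hdfs("hdfs://namenode.example.com:8020")
print "Home directory is " + str(hdfs.fileSystem.getHomeDirectory())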
Example 8
 def lsIterator(self, fpath):
     ''' Returns an iterator that returns files and dirs
         w/in a given dir path (no recursion).
     '''
     p = Path(fpath)
     self.lsListIterator = self.fsHd.listLocatedStatus(p)
     return self.lsListIterator
Example 9
 def lsFileIterator(self, fpath, recurse=False):
     ''' Returns an iterator that returns files (only)
         w/in a given dir path (w/the option for recursion).
     '''
     p = Path(fpath)
     self.fileListIterator = self.fsHd.listFiles(p, recurse)
     return self.fileListIterator
Example 10
 def isDir(self, fpath):
     try:
         # Check whether the given path is a directory on HDFS
         fp = Path(fpath)
         return self.fsHd.isDirectory(fp)
     except JException as ex:
         self.logger.error("Exception in HdfsUtil.isDir({}): ex[{}]".format(
             fpath, ex))
Example 11
 def setOwner(self, fpath, user, group):
     try:
         # Change the owner and group of the given path on HDFS
         fp = Path(fpath)
         return self.fsHd.setOwner(fp, user, group)
     except JException as ex:
         self.logger.error(
             "Exception in HdfsUtil.setOwner({}): ex[{}]".format(fpath, ex))
Example 12
def expand_path_with_home(output_folder):
    """Prepend the home folder to a relative location on HDFS if necessary.
    
    If we specified a relative path, prepend it with the home folder
    of the user on HDFS. If we are running in local mode, don't do anything.
    
    Arguments:
    output_folder -- the absolute or relative path of the output HDFS folder
    """
    import pycascading.pipe

    if pycascading.pipe.running_mode == "hadoop":
        if output_folder == "" or (output_folder[0:5] not in set(["hdfs:", "file:"]) and output_folder[0] != "/"):
            fs = Path("/").getFileSystem(Configuration())
            home_folder = fs.getHomeDirectory().toString()
            return home_folder + "/" + output_folder
    return output_folder
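
Roughly, the effect is the following; the user name and NameNode authority below are made up for illustration, and the snippet assumes running_mode is 'hadoop'.

# Illustrative behaviour only; the home-directory URI is an assumption.
print expand_path_with_home('logs/output')        # -> hdfs://nn:8020/user/alice/logs/output
print expand_path_with_home('/tmp/output')        # -> /tmp/output
print expand_path_with_home('hdfs://nn:8020/in')  # -> hdfs://nn:8020/in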
Example 13
def expand_path_with_home(output_folder):
    """Prepend the home folder to a relative location on HDFS if necessary.

    Only if we specified a relative path and no scheme, prepend it with the
    home folder of the user on HDFS. This behavior is similar to how
    "hadoop fs" works. If we are running in local mode, don't do anything.

    Arguments:
    output_folder -- the absolute or relative path of the output HDFS folder
    """
    import pycascading.pipe
    if pycascading.pipe.config['pycascading.running_mode'] == 'hadoop':
        if not any(map(lambda scheme: output_folder.startswith(scheme), \
                       ['hdfs:', 'file:', 's3:', 's3n:', '/'])):
            fs = Path('/').getFileSystem(Configuration())
            home_folder = fs.getHomeDirectory().toString()
            return home_folder + '/' + output_folder
    return output_folder
Example 14
 def touch(self, fpath):
     try:
         # Create an empty file on HDFS
         fp = Path(fpath)
         out = self.fsHd.create(fp)
         out.close()
     except JException as ex:
         self.logger.error("Exception in HdfsUtil.touch({}): ex[{}]".format(
             fpath, ex))
Example 15
 def getFileStat(self, fpath):
     try:
         # Fetch the FileStatus for the given path on HDFS
         fp = Path(fpath)
         return self.fsHd.getFileStatus(fp)
     except JException as ex:
         self.logger.error(
             "Exception in HdfsUtil.getFileStat({}): ex[{}]".format(
                 fpath, ex))
Example 16
 def setPerms(self, fpath, perms):
     try:
         # Set the permissions of the given path on HDFS
         fp = Path(fpath)
         fsPerm = FsPermission(perms)
         return self.fsHd.setPermission(fp, fsPerm)
     except JException as ex:
         self.logger.error(
             "Exception in HdfsUtil.setPerms({}): ex[{}]".format(fpath, ex))
Example 17
 def ls(self, fpath):
     ''' Returns a list of all files in a given dir.
         This file list can be very long and take lots of memory.
         Use lsIterator or lsFileIterator instead to minimize
         memory usage.
     '''
     p = Path(fpath)
     self.fileList = self.fsHd.listStatus(p)
     return self.fileList
Example 18
def expand_path_with_home(output_folder):
    """Prepend the home folder to a relative location on HDFS if necessary.
    
    If we specified a relative path, prepend it with the home folder
    of the user on HDFS. If we are running in local mode, don't do anything.
    
    Arguments:
    output_folder -- the absolute or relative path of the output HDFS folder
    """
    import pycascading.pipe
    if pycascading.pipe.running_mode == 'hadoop':
        if output_folder == '' or \
        (output_folder[0 : 5] not in set(['hdfs:', 'file:']) and \
         output_folder[0] != '/'):
            fs = Path('/').getFileSystem(Configuration())
            home_folder = fs.getHomeDirectory().toString()
            return home_folder + '/' + output_folder
    return output_folder
Example 19
 def setRep(self, fpath, replication):
     try:
         # Set the replication factor of the given path on HDFS
         fp = Path(fpath)
         return self.fsHd.setReplication(fp, replication)
     except JException as ex:
         self.logger.error(
             "Exception in HdfsUtil.setReplication({}): ex[{}]".format(
                 fpath, ex))
Example 20
    def mkdir(self, fpath, perms=755):
        try:
            p = Path(fpath)
            permObj = FsPermission("{}".format(perms))
            retVal = self.fsHd.mkdirs(p, permObj)
            if not retVal:
                self.logger.error(
                    "HdfsUtil.mkdir({}): Failed to create dir.".format(
                        fpath))

        except JException as ex:
            self.logger.error("Exception in HdfsUtil.mkdir({}): ex[{}]".format(
                fpath, ex))
Example 21
    def cp(self, srcfpath, trgfpath):
        "Copy data within the current HDFS."
        try:
            sp = []
            fileList = self.fsHd.globStatus(Path(srcfpath))
            if fileList is None or len(fileList) == 0:
                # Emit an error: no files found for srcfpath
                self.logger.error(
                    "HdfsUtil.cp({}): no files found for source path.".format(
                        srcfpath))
                return
            for sfp in fileList:
                sp.append(sfp.getPath())

            sfs = FileSystem.newInstance(self.hdfs.cHdfs)
            tp = Path(trgfpath)
            tfs = FileSystem.newInstance(self.hdfs.cHdfs)
            delSrc = False
            overWrite = True
            self.hdfs.fileUtil.copy(sfs, sp, tfs, tp, delSrc, overWrite,
                                    self.hdfs.cHdfs)
        except JException as ex:
            self.logger.error(
                "Exception in HdfsUtil.cp({} -> {}): ex[{}]".format(
                    srcfpath, trgfpath, ex))
Example 22
    def exists(self, fpath):
        try:
            # Check whether any file matches the given path (globs are allowed)
            fileList = self.fsHd.globStatus(Path(fpath))
            if fileList is None or len(fileList) == 0:
                # No files matched fpath
                return False

            return True

        except JException as ex:
            self.logger.error(
                "Exception in HdfsUtil.exists({}): ex[{}]".format(fpath, ex))
Example 23
    def openRead(self, fPath):
        fpHdfs = Path(fPath)
        fsInput = self.fsHd.open(fpHdfs)

        reader = None
        pat = r'.*\.gz'
        match = re.search(pat, fPath)
        if match is None:
            reader = BufferedReader(InputStreamReader(fsInput))
        else:
            # The file stream is in GZip format...
            reader = BufferedReader(InputStreamReader(
                GZIPInputStream(fsInput)))

        return reader
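
A hedged usage sketch of openRead; the instance name and the file path are assumptions, not part of the original snippet.

# Hypothetical usage: read a (possibly gzipped) HDFS file line by line.
reader = hdfs_util.openRead("/data/logs/events.log.gz")
line = reader.readLine()
while line is not None:
    print line
    line = reader.readLine()
reader.close()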
Example 24
    def testStore(self):
        from tempfile import mktemp
        tempdir = mktemp()
        outfile = tempdir + '/top_3_queries'
        args = [
            "n=3",
            "reducers=1",
            "input=" + self.INPUT_FILE,
            "output=" + outfile,
        ]
        proxy = PigProxy.from_file(self.PIG_SCRIPT, args)

        # By default all STORE and DUMP commands are removed
        proxy.unoverride("STORE")
        proxy.run_script()
        cluster = Cluster(proxy.pig.getPigContext())
        self.assert_(cluster.delete(Path(outfile)))
Example 25
    def __init__(self, hdfsCluster, fpath):
        self.hdfs = Hdfs(hdfsCluster)
        self.fsHd = self.hdfs.fileSystem

        fpHdfs = Path(fpath)
        fsInput = self.fsHd.open(fpHdfs)
        # The file contains text, so read the input stream through a BufferedReader.
        reader = BufferedReader(InputStreamReader(fsInput))
        self.lineCount = 0
        self.lines = []
        line = reader.readLine()
        while line is not None:
            # print line
            self.lines.append(line)
            self.lineCount = self.lineCount + 1
            if ((self.lineCount % 1000) == 0):
                print self.lineCount
            line = reader.readLine()
Example 26
    def rm(self, fpath, **kwargs):
        try:
            fileList = self.fsHd.globStatus(Path(fpath))
            if (fileList is None):
                # self.logger.warn("No Files found in: [{}]".format(fpath))
                return

            if 'recurse' in kwargs:
                recurse = kwargs['recurse']
            else:
                recurse = False

            for sfp in fileList:
                self.fsHd.delete(sfp.getPath(), recurse)

        except JException as ex:
            self.logger.error("Exception in HdfsUtil.rm({}): ex[{}]".format(
                fpath, ex))
Example 27
#  Licensed to the Apache Software Foundation (ASF) under one or more
#  contributor license agreements.  See the NOTICE file distributed with
#  this work for additional information regarding copyright ownership.
#  The ASF licenses this file to You under the Apache License, Version 2.0
#  (the "License"); you may not use this file except in compliance with
#  the License.  You may obtain a copy of the License at
#
#      http://www.apache.org/licenses/LICENSE-2.0
#
#  Unless required by applicable law or agreed to in writing, software
#  distributed under the License is distributed on an "AS IS" BASIS,
#  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
#  See the License for the specific language governing permissions and
#  limitations under the License.
from org.apache.hadoop.fs import Path  # Test for PIG-1824
p = Path('foo')


@outputSchemaFunction("squareSchema")
def square(num):
    if num is None:
        return None
    return num * num


@schemaFunction("squareSchema")
def squareSchema(input):
    return input


@outputSchema("word:chararray")
Example 28
 def copyContentFromLocalFile(self, content, dest_path, overwrite=True):
     file_path = Path(dest_path)
     fs = file_path.getFileSystem(self.configuration)
     if overwrite and fs.exists(file_path):
         fs.delete(file_path, True)
     self.createInputFile(fs, dest_path, content)
Example 29
from java.util import UUID
from org.apache.hadoop.fs import Path

print "Home dir is " + str(fs.homeDirectory)
print "Work dir is " + str(fs.workingDirectory)
print "/user exists " + str(fs.exists("/user"))

name = UUID.randomUUID().toString()
scriptName = "src/test/resources/test.properties"
fs.copyFromLocalFile(scriptName, name)
print Path(name).makeQualified(fs)

# use the shell
dir = "script-dir/"
if not fsh.test(dir):
	fsh.mkdir(dir)
	fsh.cp(name, dir)
	fsh.chmodr(700, dir)
	print "File content is " + str(fsh.cat(dir + name))


print str(fsh.ls(dir))
fsh.rmr(dir)
fs.getLength(name)
Example 30
import sys
import time

from java.lang import System
from org.apache.hadoop.conf import Configuration
from org.apache.hadoop.fs import FileSystem, Path

startTime = time.time()

if len(sys.argv) != 2:
    raise Exception(sys.argv[0] + ' <basedir>')

# Get reference to the Hadoop FileSystem object.  Everything we do in
# this script that interacts with HDFS is through this object.
fs = FileSystem.get(Configuration())

# Make sure the requested collection exists.
collection = sys.argv[1]
collectionDir = Path(collection)

if not fs.exists(collectionDir):
    print '\nERROR: no collection directory: %s' % collectionDir
    System.exit(1)

# Check for "guard" file.  Like a semaphore, ensures that we don't try
# to update this collection while it's in the middle of being updated.
# Since file creation in HDFS is atomic, we don't check for the existence
# of the guardFile, rather we try to create it.  If the file already exists
# then fs.createNewFile() will return False
guardFile = Path(collectionDir, '_updating')
if not fs.createNewFile(guardFile):
    print '\nERROR: collection update already in progress: %s' % guardFile
    System.exit(1)
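
The excerpt ends here; presumably the script removes the guard file again once the update finishes. A one-line sketch of that cleanup (not shown in the original) would be:

# Hypothetical cleanup once the collection update completes.
# The False flag requests a non-recursive delete, which is enough for a single file.
fs.delete(guardFile, False)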
Example 31
    def openWrite(self, fPath):
        fp = Path(fPath)
        fsOutput = self.fsHd.create(fp)

        return fsOutput