def __init__(self, hdfsCluster):
    """Initialize HDFS access objects for the given cluster.

    :param hdfsCluster: URI used as fs.defaultFS, e.g. hdfs://namenode:8020
    """
    self.logger = Logger.getLogger("Hdfs")
    # self.logger.setLevel(Level.DEBUG)
    # Load the standard cluster configuration, then override the default
    # filesystem with the requested cluster URI.
    coreSite = "/etc/hadoop/conf/core-site.xml"
    hdfsSite = "/etc/hadoop/conf/hdfs-site.xml"
    # (removed no-op `hdfsCluster = hdfsCluster` self-assignment)
    self.cHdfs = Configuration()
    self.cHdfs.addResource(Path(coreSite))
    self.cHdfs.addResource(Path(hdfsSite))
    self.cHdfs.set("fs.defaultFS", hdfsCluster)
    self.fileSystem = FileSystem.get(self.cHdfs)
    self.fileUtil = FileUtil()
def __init__(self, hdfs_uri, kerberos=False, kerberos_principal=None, keytab_file=None): """ :param hdfs_uri: hdfs://hadoop-name-node:port :param kerberos: optional, if kerberos authentication is needed :param kerberos_principal: optional, [email protected] :param keytab_file: optional, absolute path to keytab file """ self.logger = LoggerFactory.getLogger(self.__class__.__name__) self.logger.info("keytab_file: " + keytab_file) hdfs_conf = Configuration() if hdfs_uri.startswith('hdfs://'): hdfs_conf.set(Hdfs.FS_DEFAULT_NAME_KEY, hdfs_uri) elif hdfs_uri > "": self.logger.error("%s is an invalid uri for hdfs namenode ipc bind." % hdfs_uri) if kerberos: # init kerberos and keytab if not kerberos_principal or not keytab_file or kerberos_principal == '' or keytab_file == '': print "Kerberos Principal and Keytab File Name/Path are required!" hdfs_conf.set("hadoop.security.authentication", "kerberos") hdfs_conf.set("dfs.namenode.kerberos.principal.pattern", "*") UserGroupInformation.setConfiguration(hdfs_conf) UserGroupInformation.loginUserFromKeytab(kerberos_principal, keytab_file) self.fs = Hdfs.get(hdfs_conf) requests.packages.urllib3.disable_warnings()
class Hdfs:
    """Thin wrapper around the Hadoop FileSystem API for one cluster."""

    def __init__(self, hdfsCluster):
        """Initialize HDFS access objects for the given cluster.

        :param hdfsCluster: URI used as fs.defaultFS, e.g. hdfs://namenode:8020
        """
        self.logger = Logger.getLogger("Hdfs")
        # self.logger.setLevel(Level.DEBUG)
        # Load the standard cluster configuration, then override the default
        # filesystem with the requested cluster URI.
        coreSite = "/etc/hadoop/conf/core-site.xml"
        hdfsSite = "/etc/hadoop/conf/hdfs-site.xml"
        # (removed no-op `hdfsCluster = hdfsCluster` self-assignment)
        self.cHdfs = Configuration()
        self.cHdfs.addResource(Path(coreSite))
        self.cHdfs.addResource(Path(hdfsSite))
        self.cHdfs.set("fs.defaultFS", hdfsCluster)
        self.fileSystem = FileSystem.get(self.cHdfs)
        self.fileUtil = FileUtil()
def __init__(self):
    """Create the Hadoop configuration and obtain the FileSystem handle."""
    logger.debug("Initiate hadoop.fs.FileSystem object")
    conf = Configuration()
    self.config = conf
    self.file_system = fs.FileSystem.get(conf)
def __init__(self, hdfs_uri, kerberos=False, kerberos_principal=None, keytab_file=None): """ :param hdfs_uri: hdfs://hadoop-name-node:port :param kerberos: optional, if kerberos authentication is needed :param kerberos_principal: optional, [email protected] :param keytab_file: optional, absolute path to keytab file """ self.logger = LoggerFactory.getLogger(self.__class__.__name__) self.logger.info("keytab_file: " + keytab_file) hdfs_conf = Configuration() if hdfs_uri.startswith('hdfs://'): hdfs_conf.set(Hdfs.FS_DEFAULT_NAME_KEY, hdfs_uri) elif hdfs_uri > "": self.logger.error("%s is an invalid uri for hdfs namenode ipc bind." % hdfs_uri) if kerberos: # init kerberos and keytab if not kerberos_principal or not keytab_file or kerberos_principal == '' or keytab_file == '': print "Kerberos Principal and Keytab File Name/Path are required!" hdfs_conf.set("hadoop.security.authentication", "kerberos") hdfs_conf.set("dfs.namenode.kerberos.principal.pattern", "*") UserGroupInformation.setConfiguration(hdfs_conf) UserGroupInformation.loginUserFromKeytab(kerberos_principal, keytab_file) self.fs = Hdfs.get(hdfs_conf) requests.packages.urllib3.disable_warnings() self.logger.info("Initiated SchemaUrlHelper")
def hdfs_folder_exists(self, folder):
    """Return True if *folder* exists on HDFS and is a directory.

    :param folder: HDFS path string
    :return: True when the path exists and is a directory, False otherwise
    """
    path = Path(folder)
    fs = path.getFileSystem(Configuration())
    try:
        status = fs.getFileStatus(path)
        # TODO: there could be problems if it exists but is a simple file
        return status.isDir()
    except Exception:
        # Narrowed from a bare `except:` so SystemExit/KeyboardInterrupt
        # are not swallowed; a missing path still reports False.
        return False
def __init__(self, hdfs_uri, kerberos=False, kerberos_principal=None, keytab_file=None): """ :param hdfs_uri: hdfs://hadoop-name-node:port :param kerberos: optional, if kerberos authentication is needed :param kerberos_principal: optional, [email protected] :param keytab_file: optional, user.keytab or ~/.kerberos/user.keytab """ self.logger = LoggerFactory.getLogger(self.__class__.__name__) hdfs_conf = Configuration() if hdfs_uri.startswith('hdfs://'): hdfs_conf.set(Hdfs.FS_DEFAULT_NAME_KEY, hdfs_uri) elif hdfs_uri > "": self.logger.error( "%s is an invalid uri for hdfs namenode ipc bind." % hdfs_uri) if kerberos == True: # init kerberos and keytab if not kerberos_principal or not keytab_file or kerberos_principal == '' or keytab_file == '': print "Kerberos Principal and Keytab File Name/Path are required!" keytab_path = keytab_file if keytab_file.startswith('/'): if os.path.exists(keytab_file): keytab_path = keytab_file print "Using keytab at %s" % keytab_path else: # try relative path all_locations = [ os.getcwd(), expanduser("~") + "/.ssh", expanduser("~") + "/.kerberos", expanduser("~") + "/.wherehows", os.getenv("APP_HOME"), os.getenv("WH_HOME") ] for loc in all_locations: if os.path.exists(loc + '/' + keytab_file): keytab_path = loc + '/' + keytab_file print "Using keytab at %s" % keytab_path break hdfs_conf.set("hadoop.security.authentication", "kerberos") hdfs_conf.set("dfs.namenode.kerberos.principal.pattern", "*") UserGroupInformation.setConfiguration(hdfs_conf) UserGroupInformation.loginUserFromKeytab(kerberos_principal, keytab_path) self.fs = Hdfs.get(hdfs_conf) requests.packages.urllib3.disable_warnings()
def expand_path_with_home(output_folder):
    """Prepend the HDFS home folder to a relative output location.

    If we specified a relative path, prepend it with the home folder of
    the user on HDFS.  When running in local mode the path is returned
    untouched.

    Arguments:
    output_folder -- the absolute or relative path of the output HDFS folder
    """
    import pycascading.pipe
    if pycascading.pipe.running_mode == 'hadoop':
        # Relative means: no hdfs:/file: scheme and no leading slash.
        is_relative = (output_folder == '' or
                       not output_folder.startswith(('hdfs:', 'file:', '/')))
        if is_relative:
            home = Path('/').getFileSystem(Configuration()).getHomeDirectory()
            return home.toString() + '/' + output_folder
    return output_folder
def expand_path_with_home(output_folder):
    """Prepend the home folder to a relative location on HDFS if necessary.

    Only if we specified a relative path and no scheme, prepend it with
    the home folder of the user on HDFS.  This behavior is similar to how
    "hadoop fs" works.  If we are running in local mode, don't do anything.

    Arguments:
    output_folder -- the absolute or relative path of the output HDFS folder
    """
    import pycascading.pipe
    if pycascading.pipe.config['pycascading.running_mode'] != 'hadoop':
        return output_folder
    # Already schemed or absolute: nothing to do.
    if output_folder.startswith(('hdfs:', 'file:', 's3:', 's3n:', '/')):
        return output_folder
    filesystem = Path('/').getFileSystem(Configuration())
    return filesystem.getHomeDirectory().toString() + '/' + output_folder
def __init__(self, hdfs_uri, kerberos=False, kerberos_principal=None, keytab_file=None): """ :param hdfs_uri: hdfs://hadoop-name-node:port :param kerberos: optional, if kerberos authentication is needed :param kerberos_principal: optional, [email protected] :param keytab_file: optional, user.keytab or ~/.kerberos/user.keytab """ self.logger = LoggerFactory.getLogger(self.__class__.__name__) hdfs_conf = Configuration() if hdfs_uri.startswith('hdfs://'): hdfs_conf.set(Hdfs.FS_DEFAULT_NAME_KEY, hdfs_uri) elif hdfs_uri > "": self.logger.error("%s is an invalid uri for hdfs namenode ipc bind." % hdfs_uri) if kerberos == True: # init kerberos and keytab if not kerberos_principal or not keytab_file or kerberos_principal == '' or keytab_file == '': print "Kerberos Principal and Keytab File Name/Path are required!" keytab_path = keytab_file if keytab_file.startswith('/'): if os.path.exists(keytab_file): keytab_path = keytab_file print "Using keytab at %s" % keytab_path else: # try relative path all_locations = [os.getcwd(), expanduser("~") + "/.ssh", expanduser("~") + "/.kerberos", expanduser("~") + "/.wherehows", os.getenv("APP_HOME"), os.getenv("WH_HOME")] for loc in all_locations: if os.path.exists(loc + '/' + keytab_file): keytab_path = loc + '/' + keytab_file print "Using keytab at %s" % keytab_path break hdfs_conf.set("hadoop.security.authentication", "kerberos") hdfs_conf.set("dfs.namenode.kerberos.principal.pattern", "*") UserGroupInformation.setConfiguration(hdfs_conf) UserGroupInformation.loginUserFromKeytab(kerberos_principal, keytab_path) self.fs = Hdfs.get(hdfs_conf) requests.packages.urllib3.disable_warnings()
def __init__(self):
    """Build the default Hadoop configuration and the filesystem handle."""
    conf = Configuration()
    self.config = conf
    self.file_system = fs.FileSystem.get(conf)
from java.io import IOException from java.util import Properties from java.lang import SecurityException from java.lang import System import sys import time startTime = time.time() if len(sys.argv) != 2: raise sys.argv[0] + ' <basedir>' # Get reference to the Hadoop FileSystem object. Everything we do in # this script that interacts with HDFS is through this object. fs = FileSystem.get(Configuration()) # Make sure the requested collection exists. collection = sys.argv[1] collectionDir = Path(collection) if not fs.exists(collectionDir): print '\nERROR: no collection directory: %s' % collectionDir System.exit(1) # Check for "guard" file. Like a semaphore, ensures that we don't try # to update this collection while it's in the middle of being updated. # Since file creation in HDFS is atomic, we don't check for the existence # of the guardFile, rather we try to create it. If the file already exists # then fs.createNewFile() will return False guardFile = Path(collectionDir, '_updating')