def transfer_to_hdfs(self):
    """Push all required input files onto HDFS.

    Three stages, in order:

    1. If ``self.share_exe_setup`` is true, the executable (only when
       ``self.copy_exe`` is set) and the setup script (only when one is
       set) are copied to ``self.hdfs_store``.
    2. Every entry of ``self.common_input_file_mirrors`` is copied from
       its ``original`` location to its ``hdfs`` location.
    3. Each job in ``self.jobs`` is asked to transfer its own files.
    """
    # When a single shared instance is wanted on HDFS, the exe and setup
    # script are copied here rather than by the individual jobs.
    shared = self.share_exe_setup

    if shared and self.copy_exe:
        log.info('Copying %s -->> %s', self.exe, self.hdfs_store)
        cp_hdfs(self.exe, self.hdfs_store)

    if shared and self.setup_script:
        log.info('Copying %s -->> %s', self.setup_script, self.hdfs_store)
        cp_hdfs(self.setup_script, self.hdfs_store)

    # Input files common to all jobs.
    for mirror in self.common_input_file_mirrors:
        source, destination = mirror.original, mirror.hdfs
        log.info('Copying %s -->> %s', source, destination)
        cp_hdfs(source, destination)

    # Finally, delegate the per-job files to the jobs themselves.
    for job in self.jobs.itervalues():
        job.transfer_to_hdfs()
def transfer_to_hdfs(self):
    """Copy this job's input files to their HDFS mirror locations.

    A file is skipped when its ``original`` and ``hdfs`` paths are equal,
    or when ``self.manager.share_exe_setup`` is true and the file is the
    manager's exe or setup script (the manager copies those itself).

    The HDFS mirror directory (``self.hdfs_mirror_dir``) is passed to
    ``check_dir_create`` only when at least one file remains to be
    transferred.
    """
    pending = [
        mirror for mirror in self.input_file_mirrors
        if mirror.original != mirror.hdfs
        and not (self.manager.share_exe_setup
                 and mirror.original in [self.manager.exe,
                                         self.manager.setup_script])
    ]

    # Nothing to copy: leave HDFS untouched, no mirror dir needed.
    if not pending:
        return

    check_dir_create(self.hdfs_mirror_dir)
    for mirror in pending:
        log.info('Copying %s -->> %s', mirror.original, mirror.hdfs)
        cp_hdfs(mirror.original, mirror.hdfs)