def run(self):
    """
    Options
      --onetime[=1]  run once, then exit; use if called by an external scheduler.
      --nowait[=1]   run immediately without waiting for scheduler to determine execution.
    """
    # exit if __init__ didn't find a valid project file
    if not self.project_name:
        return

    # display application banner
    app_name = script_name()
    print(f'UDP {app_name.title()} {self.project_name}')
    copyright_year = f'{now():%Y}'
    copyright_message = f'Copyright (c) 2018-{copyright_year} Alterra Mountain Company, Inc.'
    print(copyright_message)

    # make sure the root sessions folder exists
    create_folder(self.session_folder)

    # we start logging before config/options are read, so log to a known
    # path rather than a dataset-specific path
    log_setup(log_file_name=f'{self.session_folder}/{self.project_name}.log')
    log_session_info()

    # common setup
    self.setup()

    # application-specific startup logic
    self.start()

    # scheduling behavior based on the --onetime / --nowait options
    if self.option('onetime') == '1':
        # one-time run; use when this script is called by an external scheduler
        logger.info('Option(onetime=1): executing once')
        self.main()
    else:
        if self.option('nowait') == '1':
            # no-wait option: execute immediately without waiting for the
            # scheduler, then fall through to the regular schedule
            logger.info('Option(nowait=1): executing immediately, then following regular schedule')
            self.main()

        # standard wait-for-scheduled-timeslot-and-run loop
        while True:
            self.progress_message('waiting for next job ...')
            if self.schedule.wait():
                self.main()
                if self.option('scheduled_onetime') == '1':
                    logger.info('Option(scheduled_onetime=1): ran once at first scheduled timeslot')
                    break
            else:
                break

    self.cleanup()

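# The loop above depends only on a small schedule contract: wait() blocks
# until the next slot and returns True, or returns False when the job should
# shut down. Below is a minimal sketch of a compatible schedule, assuming a
# fixed interval; the real UDP schedule object is richer and this class name
# and parameter are illustrative only.
import time
from datetime import datetime, timedelta

class Schedule:
    def __init__(self, interval_minutes=60):
        self.interval = timedelta(minutes=interval_minutes)
        self.next_run = datetime.now() + self.interval

    def wait(self):
        # sleep in short increments so the process stays responsive;
        # a real implementation would also return False on shutdown
        while datetime.now() < self.next_run:
            time.sleep(1)
        self.next_run = datetime.now() + self.interval
        return True
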
def main():
    common.setup()
    if len(sys.argv) != 3 or sys.argv[1] != '-f':
        print('USAGE: {0} -f hdfs_path'.format(common.script_name()))
        print('WARNING: This deletes hdfs_path from your cluster.')
        sys.exit(1)
    nix = sys.argv[2]
    common.send_coordinator('/job/clean', {'path': nix}, verify=True)

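# Several scripts here hand work to the cluster through
# common.send_coordinator. A hedged sketch of what that call might look
# like is below: POST the parameters as JSON to a coordinator endpoint.
# The coordinator address is an assumption, not from the source, and the
# real common.send_coordinator may differ.
import json
import urllib.request

COORDINATOR_URL = 'http://localhost:8080'  # assumed address

def send_coordinator(path, params, verify=False):
    # urlopen raises HTTPError on 4xx/5xx responses, which covers the
    # verify=True case of failing loudly when the coordinator rejects
    # the request
    request = urllib.request.Request(
        COORDINATOR_URL + path,
        data=json.dumps(params).encode('utf-8'),
        headers={'Content-Type': 'application/json'})
    with urllib.request.urlopen(request) as response:
        return response.read().decode('utf-8')
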
def main():
    common.setup()
    usage = ('USAGE: {0} [1,2,3]\nPhase 1 generates data, phase 2 sorts it, and '
             'phase 3 validates it.'.format(common.script_name()))
    if len(sys.argv) != 2:
        print(usage)
        sys.exit(1)

    # Detect whether the user has the jar.
    jar = 'hadoop-examples-{0}.jar'.format(cfg.hadoop_version)
    if not os.path.exists(jar):
        print('You need {0}, which contains the Terasort MapReduce job, in your '
              'current directory.'.format(jar))
        print('Please run the following commands to get it, then re-run this '
              'script.')
        print('')
        tarball = '{0}.tar.gz'.format(cfg.hadoop_fn)
        print('wget http://{0}/{1}/{2}'.format(cfg.hadoop_url, cfg.hadoop_fn, tarball))
        print('tar xzf {0}'.format(tarball))
        print('cp {0}/{1} .'.format(cfg.hadoop_fn, jar))
        sys.exit(1)

    # TODO: Figure out the number of tasks programmatically. The defaults are
    # sometimes 1.
    num_tasks = 100

    phase = sys.argv[1]
    job_args = []
    if phase == '1':
        gigabytes = 1000
        # convert GB -> bytes, then divide by 100: teragen takes a count
        # of 100-byte rows
        num_rows = gigabytes * (10**7)
        job_args = ['teragen', '-Dmapred.map.tasks={0}'.format(num_tasks),
                    num_rows, '/job_input/terasort']
    elif phase == '2':
        # The terasort driver automatically uses as many map tasks as possible.
        job_args = ['terasort', '-Dmapred.reduce.tasks={0}'.format(num_tasks),
                    '/job_input/terasort', '/job_output/terasort']
    elif phase == '3':
        job_args = ['teravalidate', '/job_output/terasort', '/job_output/teravalidate']
    else:
        print(usage)
        sys.exit(1)

    common.start_job(jar, job_args)

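# Worked check of the teragen sizing above: teragen's size argument is a
# row count and each generated row is 100 bytes, so 1000 GB of input is
# 10**12 bytes / 100 = 10**10 rows, the same value the gigabytes * 10**7
# shortcut produces.
gigabytes = 1000
rows = gigabytes * 10**9 // 100
assert rows == gigabytes * 10**7 == 10**10
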
def main():
    common.setup()
    if len(sys.argv) != 2:
        print('USAGE: {0} num_slaves'.format(common.script_name()))
        sys.exit(1)
    num_slaves = int(sys.argv[1])
    print('Adding {0} slaves...'.format(num_slaves))
    common.send_coordinator('/hadoop/add_slaves', {'num_slaves': num_slaves})

def main():
    common.setup()
    if len(sys.argv) != 2:
        print('USAGE: {0} num_slaves'.format(common.script_name()))
        sys.exit(1)
    num_slaves = int(sys.argv[1])
    if num_slaves < cfg.needed_slaves:
        print('Hadoop needs at least {0} slaves for filesystem '
              'replication.'.format(cfg.needed_slaves))
        sys.exit(1)
    print('Setting up Hadoop...')
    common.send_coordinator('/hadoop/launch', {'num_slaves': num_slaves})
    common.wait_for_hadoop()

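# common.wait_for_hadoop blocks until the cluster is usable. A hedged
# sketch of one way it could work is below: poll the coordinator until it
# reports ready or a timeout expires. The '/hadoop/status' path and the
# 'ready' response text are assumptions, and send_coordinator refers to
# the sketch shown earlier.
import time

def wait_for_hadoop(timeout_seconds=600, poll_seconds=10):
    deadline = time.time() + timeout_seconds
    while time.time() < deadline:
        if 'ready' in send_coordinator('/hadoop/status', {}):
            return
        time.sleep(poll_seconds)
    raise RuntimeError('Hadoop cluster did not become ready in time')
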
def main():
    common.setup()
    usage = 'USAGE: {0} hdfs_src gs_dst'.format(common.script_name())
    if len(sys.argv) != 3:
        print(usage)
        sys.exit(1)
    src = sys.argv[1]
    if sys.argv[2].startswith('gs://') or not sys.argv[2].startswith('/'):
        print(usage)
        print('gs_dst should be of the form /path/to/object. gs://{0} will be '
              'prefixed for you.'.format(cfg.gs_bucket))
        sys.exit(1)
    dst = 'gs://{0}{1}'.format(cfg.gs_bucket, sys.argv[2])
    common.download(src, dst)

def __init__(self, project_file=None):
    # session folder (acts as root path for job-specific folders)
    self.session_folder = '../sessions'

    # configuration engines
    self.config = None
    self.option = None

    # project metadata
    self.project = None
    self.namespace = None

    # project resources
    self.database = None
    self.schedule = None

    # project dataset-specific working folders
    self.state_folder = None
    self.work_folder = None
    self.publish_folder = None

    # project database connections (db_conn)
    self.source_db_conn = None
    self.target_db_conn = None

    # project file and name
    self.project_file = ''
    self.project_name = ''

    # if an optional project file was supplied, use it; otherwise try the command line
    if project_file:
        self.project_file = project_file
    elif len(sys.argv) > 1:
        self.project_file = sys.argv[1]

    # make sure we have a valid project file
    app_name = script_name()
    if not self.project_file:
        print(f'{app_name}: error - must specify project file')
    elif not is_file(f'../conf/{self.project_file}'):
        # report self.project_file, which is set whether the file came from
        # the argument or the command line (the bare parameter may be None)
        print(f'{app_name}: error - project file not found ({self.project_file})')
    else:
        # the project file controls configuration
        self.project_name = just_file_stem(self.project_file)

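# Minimal sketches of the filename helpers __init__ relies on, built on
# pathlib; the real UDP utility functions may differ in detail.
import sys
from pathlib import Path

def script_name():
    # name of the running script, without directory or extension
    return Path(sys.argv[0]).stem

def is_file(path):
    return Path(path).is_file()

def just_file_stem(path):
    # 'project_capture_sales.ini' -> 'project_capture_sales'
    return Path(path).stem
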
import datetime

def log_exception(e):
    # append the timestamped exception to a per-script log file
    log_file = script_name() + '.log'
    with open(log_file, 'a') as output_stream:
        output_stream.write(f'{datetime.datetime.now()}\n{e}\n')
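
# Typical usage: record the failure, then re-raise so callers still see it.
# risky_operation is a hypothetical stand-in, not a function from the source.
try:
    risky_operation()
except Exception as e:
    log_exception(e)
    raise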