コード例 #1
0
    def run(self):
        """
        Options
        --onetime[=1] run once, then exit; use if called by an external scheduler.
        --nowait[=1] run immediately without waiting for scheduler to determine execution.
        """

        # bail out when __init__ failed to locate a valid project file
        if not self.project_name:
            return

        # application banner
        app_name = script_name()
        print(f'UDP {app_name.title()} {self.project_name}')
        copyright_year = f'{now():%Y}'
        print(f'Copyright (c) 2018-{copyright_year} Alterra Mountain Company, Inc.')

        # the root sessions folder must exist before logging starts
        create_folder(self.session_folder)

        # log to a known path; config/options are not loaded yet, so the
        # dataset-specific log path is not available at this point
        log_setup(
            log_file_name=f'{self.session_folder}/{self.project_name}.log')
        log_session_info()

        # shared setup, then application specific startup logic
        self.setup()
        self.start()

        # scheduling behavior based on --onetime, --nowait option
        if self.option('onetime') == '1':
            # single run; used when an external scheduler drives this script
            logger.info('Option(onetime=1): executing once')
            self.main()
        else:
            if self.option('nowait') == '1':
                # run immediately, then fall through to the normal schedule
                logger.info(
                    'Option(nowait=1): executing immediately, then following regular schedule'
                )
                self.main()

            # wait for each scheduled time slot, run, repeat until the
            # scheduler reports shutdown (wait() returning falsy)
            while True:
                self.progress_message('waiting for next job ...')
                if not self.schedule.wait():
                    break
                self.main()
                if self.option('scheduled_onetime') == '1':
                    logger.info(
                        'Option(scheduled_onetime=1): ran once at first scheduled timeslot'
                    )
                    break

        self.cleanup()
コード例 #2
0
def main():
    common.setup()
    if len(sys.argv) != 3 or sys.argv[1] != '-f':
        print 'USAGE: {0} -f hdfs_path'.format(common.script_name())
        print 'WARNING: This deletes hdfs_path from your cluster.'
        sys.exit(1)

    nix = sys.argv[2]
    common.send_coordinator('/job/clean', {'path': nix}, verify=True)
コード例 #3
0
def main():
  """Delete an HDFS path from the cluster via the coordinator service."""
  common.setup()
  # Require exactly: <script> -f hdfs_path; the explicit -f flag acts as a
  # safety gate against accidental deletion.
  if len(sys.argv) != 3 or sys.argv[1] != '-f':
    print 'USAGE: {0} -f hdfs_path'.format(common.script_name())
    print 'WARNING: This deletes hdfs_path from your cluster.'
    sys.exit(1)

  # Path to remove; the coordinator performs the actual deletion
  # (verify=True presumably validates the response — confirm in common).
  nix = sys.argv[2]
  common.send_coordinator('/job/clean', {'path': nix}, verify=True)
コード例 #4
0
def main():
    common.setup()
    usage = (
        'USAGE: {0} [1,2,3]\nPhase 1 generates data, phase 2 sorts it, and '
        'phase 3 validates it.'.format(common.script_name()))

    if len(sys.argv) != 2:
        print usage
        sys.exit(1)

    # Detect if the user has the jar
    jar = 'hadoop-examples-{0}.jar'.format(cfg.hadoop_version)
    if not os.path.exists(jar):
        print(
            'You need {0}, which contains the Terasort MapReduce job, in your '
            'current directory.').format(jar)
        print(
            'Please run the following commands to get it, then re-run this '
            'script.')
        print
        tarball = '{0}.tar.gz'.format(cfg.hadoop_fn)
        print 'wget http://{0}/{1}/{2}'.format(cfg.hadoop_url, cfg.hadoop_fn,
                                               tarball)
        print 'tar xzf {0}'.format(tarball)
        print 'cp {0}/{1} .'.format(cfg.hadoop_fn, jar)
        sys.exit(1)

    # TODO Figure out number of tasks programatically. The defaults are sometimes
    # 1.
    num_tasks = 100
    phase = sys.argv[1]

    job_args = []
    if phase == '1':
        gigabytes = 1000
        # Convert GB->bytes, then divide by 100
        hundred_bytes = gigabytes * (10**7)

        job_args = [
            'teragen', '-Dmapred.map.tasks={0}'.format(num_tasks),
            hundred_bytes, '/job_input/terasort'
        ]
    elif phase == '2':
        # The terasort driver automatically uses as many map tasks as possible.
        job_args = [
            'terasort', '-Dmapred.reduce.tasks={0}'.format(num_tasks),
            '/job_input/terasort', '/job_output/terasort'
        ]
    elif phase == '3':
        job_args = [
            'teravalidate', '/job_output/terasort', '/job_output/teravalidate'
        ]
    else:
        print usage
        sys.exit(1)

    common.start_job(jar, job_args)
コード例 #5
0
def main():
  """Ask the coordinator to add slave nodes to the Hadoop cluster."""
  common.setup()

  # Exactly one argument: the number of slaves to add.
  if len(sys.argv) != 2:
    print 'USAGE: {0} num_slaves'.format(common.script_name())
    sys.exit(1)

  # int() raises ValueError on a non-numeric argument (uncaught by design).
  num_slaves = int(sys.argv[1])

  print 'Adding {0} slaves...'.format(num_slaves)
  common.send_coordinator('/hadoop/add_slaves', {'num_slaves': num_slaves})
コード例 #6
0
def main():
    common.setup()

    if len(sys.argv) != 2:
        print 'USAGE: {0} num_slaves'.format(common.script_name())
        sys.exit(1)

    num_slaves = int(sys.argv[1])

    print 'Adding {0} slaves...'.format(num_slaves)
    common.send_coordinator('/hadoop/add_slaves', {'num_slaves': num_slaves})
コード例 #7
0
def main():
    """Run one phase of the Hadoop Terasort example: generate, sort, or validate."""
    common.setup()
    usage = "USAGE: {0} [1,2,3]\nPhase 1 generates data, phase 2 sorts it, and " "phase 3 validates it.".format(
        common.script_name()
    )

    if len(sys.argv) != 2:
        print usage
        sys.exit(1)

    # Detect if the user has the jar
    jar = "hadoop-examples-{0}.jar".format(cfg.hadoop_version)
    if not os.path.exists(jar):
        # The jar is missing: print fetch instructions and bail out.
        print ("You need {0}, which contains the Terasort MapReduce job, in your " "current directory.").format(jar)
        print ("Please run the following commands to get it, then re-run this " "script.")
        print
        tarball = "{0}.tar.gz".format(cfg.hadoop_fn)
        print "wget http://{0}/{1}/{2}".format(cfg.hadoop_url, cfg.hadoop_fn, tarball)
        print "tar xzf {0}".format(tarball)
        print "cp {0}/{1} .".format(cfg.hadoop_fn, jar)
        sys.exit(1)

    # TODO Figure out number of tasks programatically. The defaults are sometimes
    # 1.
    num_tasks = 100
    phase = sys.argv[1]

    job_args = []
    if phase == "1":
        gigabytes = 1000
        # Convert GB->bytes, then divide by 100
        # (teragen's size argument appears to be in 100-byte units).
        hundred_bytes = gigabytes * (10 ** 7)

        job_args = ["teragen", "-Dmapred.map.tasks={0}".format(num_tasks), hundred_bytes, "/job_input/terasort"]
    elif phase == "2":
        # The terasort driver automatically uses as many map tasks as possible.
        job_args = [
            "terasort",
            "-Dmapred.reduce.tasks={0}".format(num_tasks),
            "/job_input/terasort",
            "/job_output/terasort",
        ]
    elif phase == "3":
        job_args = ["teravalidate", "/job_output/terasort", "/job_output/teravalidate"]
    else:
        # Unknown phase argument.
        print usage
        sys.exit(1)

    common.start_job(jar, job_args)
コード例 #8
0
def main():
    common.setup()

    if len(sys.argv) != 2:
        print 'USAGE: {0} num_slaves'.format(common.script_name())
        sys.exit(1)
    num_slaves = int(sys.argv[1])
    if num_slaves < cfg.needed_slaves:
        print('Hadoop needs at least {0} slaves for filesystem '
              'replication').format(cfg.needed_slaves)
        sys.exit(1)

    print 'Setting up Hadoop...'
    common.send_coordinator('/hadoop/launch', {'num_slaves': num_slaves})
    common.wait_for_hadoop()
コード例 #9
0
def main():
  """Launch a Hadoop cluster with the requested number of slave nodes."""
  common.setup()

  # One required argument: the slave count.
  if len(sys.argv) != 2:
    print 'USAGE: {0} num_slaves'.format(common.script_name())
    sys.exit(1)
  num_slaves = int(sys.argv[1])
  # Filesystem replication needs a minimum cluster size; refuse anything smaller.
  if num_slaves < cfg.needed_slaves:
    print ('Hadoop needs at least {0} slaves for filesystem '
           'replication').format(cfg.needed_slaves)
    sys.exit(1)

  print 'Setting up Hadoop...'
  common.send_coordinator('/hadoop/launch', {'num_slaves': num_slaves})
  # Block until the cluster reports ready.
  common.wait_for_hadoop()
def main():
  """Copy an HDFS source path into the configured Google Storage bucket."""
  common.setup()
  usage = 'USAGE: {0} hdfs_src gs_dst'.format(common.script_name())
  if len(sys.argv) != 3:
    print usage
    sys.exit(1)

  src = sys.argv[1]
  # gs_dst must be bucket-relative: no gs:// scheme, and it must start
  # with '/' (the bucket prefix is added below).
  if sys.argv[2].startswith('gs://') or not sys.argv[2].startswith('/'):
    print usage
    print ('gs_dst should be of the form /path/to/object. gs://{0} will be '
           'prefixed for you.').format(cfg.gs_bucket)
    sys.exit(1)
  dst = 'gs://{0}{1}'.format(cfg.gs_bucket, sys.argv[2])

  common.download(src, dst)
コード例 #11
0
def main():
    common.setup()
    usage = 'USAGE: {0} hdfs_src gs_dst'.format(common.script_name())
    if len(sys.argv) != 3:
        print usage
        sys.exit(1)

    src = sys.argv[1]
    if sys.argv[2].startswith('gs://') or not sys.argv[2].startswith('/'):
        print usage
        print(
            'gs_dst should be of the form /path/to/object. gs://{0} will be '
            'prefixed for you.').format(cfg.gs_bucket)
        sys.exit(1)
    dst = 'gs://{0}{1}'.format(cfg.gs_bucket, sys.argv[2])

    common.download(src, dst)
コード例 #12
0
    def __init__(self, project_file=None):
        """
        Initialize application state and resolve the project file.

        project_file: optional path to the project file; when omitted, the
        first command line argument is used instead. On success,
        self.project_name is set (run() treats an empty project_name as
        "no valid project file").
        """
        # session folder (acts as root path for job specific folders)
        self.session_folder = '../sessions'

        # configuration engines
        self.config = None
        self.option = None

        # project metadata
        self.project = None
        self.namespace = None

        # project resources
        self.database = None
        self.schedule = None

        # project dataset specific working folders
        self.state_folder = None
        self.work_folder = None
        self.publish_folder = None

        # project database connections (db_conn)
        self.source_db_conn = None
        self.target_db_conn = None

        # project file and name
        self.project_file = ''
        self.project_name = ''

        # if optional project file supplied use it; otherwise try command line
        if project_file:
            self.project_file = project_file
        elif len(sys.argv) > 1:
            self.project_file = sys.argv[1]

        # make sure we have a valid project file
        app_name = script_name()
        if not self.project_file:
            print(f'{app_name}: error - must specify project file')
        elif not is_file(f'../conf/{self.project_file}'):
            # BUG FIX: report the resolved self.project_file, not the
            # project_file parameter, which is None whenever the file name
            # came from the command line.
            print(
                f'{app_name}: error - project file not found ({self.project_file})')
        else:
            # project file controls configuration
            self.project_name = just_file_stem(self.project_file)
コード例 #13
0
 def log_exception(e):
     """Append the exception text, stamped with the current time, to a per-script log file."""
     entry = f'{datetime.datetime.now()}\n{e}\n'
     with open(script_name() + '.log', 'a') as log:
         log.write(entry)